default	rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD
section	.text code align=64


EXTERN	OPENSSL_ia32cap_P
global	aesni_cbc_sha256_enc

ALIGN	16
;-----------------------------------------------------------------------
; aesni_cbc_sha256_enc -- run-time dispatcher (Win64 ABI, 1st arg in rcx)
;
; Probe mode: when rcx is zero the routine returns 1 in eax and touches
; nothing else.
;
; Otherwise it reads OPENSSL_ia32cap_P and tail-jumps, leaving the
; argument registers rcx/rdx/r8/r9 and the stack untouched, to the first
; matching implementation:
;   bit 61 of the qword at +4          -> aesni_cbc_sha256_enc_shaext
;   bit 11 (2048) of the dword at +4   -> aesni_cbc_sha256_enc_xop
;   bits 3,5,8 (296) of the dword at +8, all set
;                                      -> aesni_cbc_sha256_enc_avx2
;   bit 28 of the dword at +4          -> aesni_cbc_sha256_enc_avx
; NOTE(review): these presumably are the SHA, XOP, BMI1+AVX2+BMI2 and AVX
; capability flags -- confirm against the OPENSSL_ia32cap documentation.
;
; If no implementation matches, the routine traps with ud2.  ud2 never
; falls through, so no fallback code follows it.
;
; Clobbers: rax, r10, r11, flags.
;-----------------------------------------------------------------------
aesni_cbc_sha256_enc:
	lea	r11,[OPENSSL_ia32cap_P]
	mov	eax,1			; return value for probe mode
	test	rcx,rcx			; first argument == 0 -> probe only
	jz	NEAR $L$probe
	mov	eax,DWORD[r11]
	mov	r10,QWORD[4+r11]	; r10 = capability dwords 1 (low) and 2 (high)
	bt	r10,61
	jc	NEAR aesni_cbc_sha256_enc_shaext
	mov	r11,r10
	shr	r11,32			; r11d = capability dword 2

	test	r10d,2048
	jnz	NEAR aesni_cbc_sha256_enc_xop
	and	r11d,296
	cmp	r11d,296		; all three bits required
	je	NEAR aesni_cbc_sha256_enc_avx2
	and	r10d,268435456
	jnz	NEAR aesni_cbc_sha256_enc_avx
	ud2				; no usable implementation: trap
$L$probe:
	DB	0F3h,0C3h		;repret


ALIGN	64

;-----------------------------------------------------------------------
; K256 -- constant pool used by the AES-CBC + SHA-256 code in this file.
;
;   +0   .. +511 : SHA-256 round constants K[0..63].  Every row of four
;                  constants is stored twice in succession; the 128-bit
;                  code visible in this file therefore steps through the
;                  table in 32-byte strides ([rbp], [32+rbp], ...).
;                  NOTE(review): the duplicate rows are presumably for
;                  256-bit loads in a code path outside this view.
;   +512 .. +543 : vpshufb control mask that reverses the bytes of each
;                  32-bit word (big-endian message load), stored twice.
;                  The zero top byte of its first dword (offset +515)
;                  also serves as the end marker tested by
;                  "cmp BYTE[131+rbp],0".
;   +544 .. +623 : dword masks of 0 / -1.  Three 16-byte windows are
;                  loaded from [K256+544 + 8*(rounds-9)] at offsets 0,
;                  16 and 32 and used to pick one of three vaesenclast
;                  results.
;   +624 ..      : NUL-terminated ASCII string
;                  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by
;                  <appro@openssl.org>"
;-----------------------------------------------------------------------
K256:
	DD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	DD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	DD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	DD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	DD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	DD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	DD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	DD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	DD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	DD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	DD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	DD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	DD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	DD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	DD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	DD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	DD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	DD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	DD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	DD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	DD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	DD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	DD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	DD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	DD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	DD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	DD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	DD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	DD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	DD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	DD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	DD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

; +512: byte-swap mask for vpshufb (two copies)
	DD	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	DD	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
; +544: 0 / -1 selection masks for the three candidate last-round results
	DD	0,0,0,0,0,0,0,0,-1,-1,-1,-1
	DD	0,0,0,0,0,0,0,0
; +624: identification string (ASCII, NUL-terminated)
DB	65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54
DB	32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95
DB	54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98
DB	121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108
DB	46,111,114,103,62,0
ALIGN	64

ALIGN	64
;-----------------------------------------------------------------------
; aesni_cbc_sha256_enc_xop -- AES-CBC encryption interleaved with SHA-256,
; using AVX/AES-NI plus hand-encoded XOP rotates.
;
; ABI: Win64.  Arguments as used by the code below:
;   rcx        arg1  AES input pointer (16-byte blocks are read from it)
;   rdx        arg2  AES output pointer
;   r8         arg3  count of 64-byte blocks (shifted left by 6 below)
;   r9         arg4  AES key schedule; round keys from +0, DWORD at +240
;                    is used as the round count
;   [40+rsp]   arg5  16-byte IV buffer (read on entry, rewritten on exit)
;   [48+rsp]   arg6  SHA-256 state, eight dwords (read and updated)
;   [56+rsp]   arg7  pointer to the data that is hashed
; The arguments are first moved into rdi,rsi,rdx,rcx,r8,r9 (+ r10).
;
; Stack frame after the prologue (rsp is 64-byte aligned):
;   [0   .. 63 ]  W[t]+K[t] for the next 16 rounds
;   [64+0 ]       current AES input pointer
;   [64+8 ]       output pointer minus input pointer
;   [64+16]       end of AES input (input + 64*blocks)
;   [64+32]       IV pointer
;   [64+40]       SHA-256 state pointer
;   [64+48]       hash-data pointer minus input pointer
;   [64+56]       rsp as it was right after the six pushes
;   [128 .. 287]  saved xmm6-xmm15
;-----------------------------------------------------------------------
aesni_cbc_sha256_enc_xop:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesni_cbc_sha256_enc_xop:
; remap the Win64 argument registers/stack slots
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]


$L$xop_shortcut:
	mov	r10,QWORD[56+rsp]
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
; r11 remembers rsp after the pushes; the epilogue restores from it
	mov	r11,rsp
	sub	rsp,288
	and	rsp,-64

; rdx = end of input, rsi/r10 become offsets relative to the input pointer
	shl	rdx,6
	sub	rsi,rdi
	sub	r10,rdi
	add	rdx,rdi


	mov	QWORD[((64+8))+rsp],rsi
	mov	QWORD[((64+16))+rsp],rdx

	mov	QWORD[((64+32))+rsp],r8
	mov	QWORD[((64+40))+rsp],r9
	mov	QWORD[((64+48))+rsp],r10
	mov	QWORD[((64+56))+rsp],r11
; xmm6-xmm15 are callee-saved under the Win64 ABI
	movaps	XMMWORD[128+rsp],xmm6
	movaps	XMMWORD[144+rsp],xmm7
	movaps	XMMWORD[160+rsp],xmm8
	movaps	XMMWORD[176+rsp],xmm9
	movaps	XMMWORD[192+rsp],xmm10
	movaps	XMMWORD[208+rsp],xmm11
	movaps	XMMWORD[224+rsp],xmm12
	movaps	XMMWORD[240+rsp],xmm13
	movaps	XMMWORD[256+rsp],xmm14
	movaps	XMMWORD[272+rsp],xmm15
$L$prologue_xop:
	vzeroall

; r12 = AES input cursor, rdi = key schedule + 128 (biased so that the
; round keys are addressed as [n-128+rdi]), r13 = selection mask table
	mov	r12,rdi
	lea	rdi,[128+rcx]
	lea	r13,[((K256+544))]
	mov	r14d,DWORD[((240-128))+rdi]
	mov	r15,r9
	mov	rsi,r10
; xmm8 = IV (it later carries the previous ciphertext block)
	vmovdqu	xmm8,XMMWORD[r8]
; NOTE(review): the aligned loads below need 8*(rounds-9) to be a
; multiple of 16, i.e. an odd round count at key+240 -- confirm with the
; key-setup code.
	sub	r14,9

; SHA-256 working variables: a..h = eax,ebx,ecx,edx,r8d,r9d,r10d,r11d
	mov	eax,DWORD[r15]
	mov	ebx,DWORD[4+r15]
	mov	ecx,DWORD[8+r15]
	mov	edx,DWORD[12+r15]
	mov	r8d,DWORD[16+r15]
	mov	r9d,DWORD[20+r15]
	mov	r10d,DWORD[24+r15]
	mov	r11d,DWORD[28+r15]

; xmm12/xmm13/xmm14 = masks applied to the 1st/2nd/3rd vaesenclast result
	vmovdqa	xmm14,XMMWORD[r14*8+r13]
	vmovdqa	xmm13,XMMWORD[16+r14*8+r13]
	vmovdqa	xmm12,XMMWORD[32+r14*8+r13]
; xmm10 = round key 0
	vmovdqu	xmm10,XMMWORD[((0-128))+rdi]
	jmp	NEAR $L$loop_xop
ALIGN	16
; Outer loop, one pass per 64-byte block.
; Loads 64 bytes of hash data from [r12+rsi] (rsi = hash-data offset),
; byte-swaps each dword, pre-adds the first 16 round constants and parks
; W[0..15]+K[0..15] at [rsp .. rsp+63].  xmm0-xmm3 keep W[0..15].
$L$loop_xop:
	vmovdqa	xmm7,XMMWORD[((K256+512))]
	vmovdqu	xmm0,XMMWORD[r12*1+rsi]
	vmovdqu	xmm1,XMMWORD[16+r12*1+rsi]
	vmovdqu	xmm2,XMMWORD[32+r12*1+rsi]
	vmovdqu	xmm3,XMMWORD[48+r12*1+rsi]
	vpshufb	xmm0,xmm0,xmm7
; rbp walks the K256 table (32 bytes per row of four constants)
	lea	rbp,[K256]
	vpshufb	xmm1,xmm1,xmm7
	vpshufb	xmm2,xmm2,xmm7
	vpaddd	xmm4,xmm0,XMMWORD[rbp]
	vpshufb	xmm3,xmm3,xmm7
	vpaddd	xmm5,xmm1,XMMWORD[32+rbp]
	vpaddd	xmm6,xmm2,XMMWORD[64+rbp]
	vpaddd	xmm7,xmm3,XMMWORD[96+rbp]
	vmovdqa	XMMWORD[rsp],xmm4
; r14d = a; its Sigma0 is built up lazily across the round
	mov	r14d,eax
	vmovdqa	XMMWORD[16+rsp],xmm5
; esi = b ^ c, seed for the Maj(a,b,c) computation (clobbers the offset
; in rsi, which is reloaded from [64+48+rsp] at the end of the block)
	mov	esi,ebx
	vmovdqa	XMMWORD[32+rsp],xmm6
	xor	esi,ecx
	vmovdqa	XMMWORD[48+rsp],xmm7
; r13d = e, seed for Sigma1(e)
	mov	r13d,r8d
	jmp	NEAR $L$xop_00_47

ALIGN	16
; Inner loop: each pass performs 16 SHA-256 rounds, expands the next 16
; message-schedule words and CBC-encrypts one 16-byte AES block.
;
; Scalar round pattern (register roles rotate every round):
;   r13d: ror 14, xor e, ror 5, xor e, ror 6        -> Sigma1(e)
;   r14d: ror 9,  xor a, ror 11, xor a, ror 2       -> Sigma0(a)
;   r12d: ((f ^ g) & e) ^ g                         -> Ch(e,f,g)
;   esi/r15d (alternating): ((a ^ b) & (b ^ c)) ^ b -> Maj(a,b,c)
;   h += [4*i+rsp] + Ch + Sigma1 ; d += h ; h += Maj ; r14d += h
; Vector side: xmm0-xmm3 hold W[]; the DB sequences are XOP "vprotd"
; rotates emitted as raw bytes; xmm6 receives W+K for a later pass and is
; stored back to the stack.
; AES side: xmm9 = block state, xmm10 = next round key, xmm8 = chaining
; value; three vaesenclast candidates are masked with xmm12/13/14 and
; OR-ed together into xmm8.
;
; The loop runs three times: rbp advances 128 bytes per pass and the exit
; test reads the top byte of the dword at [128+rbp], which is zero only
; when that dword is the first word of the byte-swap mask at K256+512.
$L$xop_00_47:
	sub	rbp,-16*2*4
; AES: fetch the next plaintext block and remember the cursor
	vmovdqu	xmm9,XMMWORD[r12]
	mov	QWORD[((64+0))+rsp],r12
; ---- round 0 of 16 (h=r11d, d=edx); schedule update for xmm0 ----
	vpalignr	xmm4,xmm1,xmm0,4
	ror	r13d,14
	mov	eax,r14d
	vpalignr	xmm7,xmm3,xmm2,4
	mov	r12d,r9d
	xor	r13d,r8d
DB	143,232,120,194,236,14		; vprotd xmm5,xmm4,14
	ror	r14d,9
	xor	r12d,r10d
	vpsrld	xmm4,xmm4,3
	ror	r13d,5
	xor	r14d,eax
	vpaddd	xmm0,xmm0,xmm7
	and	r12d,r8d
; AES: whitening with round key 0
	vpxor	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((16-128))+rdi]
	xor	r13d,r8d
	add	r11d,DWORD[rsp]
	mov	r15d,eax
DB	143,232,120,194,245,11		; vprotd xmm6,xmm5,11
	ror	r14d,11
	xor	r12d,r10d
	vpxor	xmm4,xmm4,xmm5
	xor	r15d,ebx
	ror	r13d,6
	add	r11d,r12d
	and	esi,r15d
DB	143,232,120,194,251,13		; vprotd xmm7,xmm3,13
	xor	r14d,eax
	add	r11d,r13d
	vpxor	xmm4,xmm4,xmm6
	xor	esi,ebx
	add	edx,r11d
	vpsrld	xmm6,xmm3,10
	ror	r14d,2
	add	r11d,esi
	vpaddd	xmm0,xmm0,xmm4
	mov	r13d,edx
	add	r14d,r11d
DB	143,232,120,194,239,2		; vprotd xmm5,xmm7,2
; ---- round 1 of 16 (h=r10d, d=ecx) ----
	ror	r13d,14
	mov	r11d,r14d
	vpxor	xmm7,xmm7,xmm6
	mov	r12d,r8d
	xor	r13d,edx
	ror	r14d,9
	xor	r12d,r9d
	vpxor	xmm7,xmm7,xmm5
	ror	r13d,5
	xor	r14d,r11d
	and	r12d,edx
; AES: CBC chaining, xor in the previous ciphertext block / IV
	vpxor	xmm9,xmm9,xmm8
	xor	r13d,edx
	vpsrldq	xmm7,xmm7,8
	add	r10d,DWORD[4+rsp]
	mov	esi,r11d
	ror	r14d,11
	xor	r12d,r9d
	vpaddd	xmm0,xmm0,xmm7
	xor	esi,eax
	ror	r13d,6
	add	r10d,r12d
	and	r15d,esi
DB	143,232,120,194,248,13		; vprotd xmm7,xmm0,13
	xor	r14d,r11d
	add	r10d,r13d
	vpsrld	xmm6,xmm0,10
	xor	r15d,eax
	add	ecx,r10d
DB	143,232,120,194,239,2		; vprotd xmm5,xmm7,2
	ror	r14d,2
	add	r10d,r15d
	vpxor	xmm7,xmm7,xmm6
	mov	r13d,ecx
	add	r14d,r10d
; ---- round 2 of 16 (h=r9d, d=ebx) ----
	ror	r13d,14
	mov	r10d,r14d
	vpxor	xmm7,xmm7,xmm5
	mov	r12d,edx
	xor	r13d,ecx
	ror	r14d,9
	xor	r12d,r8d
	vpslldq	xmm7,xmm7,8
	ror	r13d,5
	xor	r14d,r10d
	and	r12d,ecx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((32-128))+rdi]
	xor	r13d,ecx
	vpaddd	xmm0,xmm0,xmm7
	add	r9d,DWORD[8+rsp]
	mov	r15d,r10d
	ror	r14d,11
	xor	r12d,r8d
; xmm6 = new W words + K, stored to [rsp] at the start of round 4
	vpaddd	xmm6,xmm0,XMMWORD[rbp]
	xor	r15d,r11d
	ror	r13d,6
	add	r9d,r12d
	and	esi,r15d
	xor	r14d,r10d
	add	r9d,r13d
	xor	esi,r11d
	add	ebx,r9d
	ror	r14d,2
	add	r9d,esi
	mov	r13d,ebx
	add	r14d,r9d
; ---- round 3 of 16 (h=r8d, d=eax) ----
	ror	r13d,14
	mov	r9d,r14d
	mov	r12d,ecx
	xor	r13d,ebx
	ror	r14d,9
	xor	r12d,edx
	ror	r13d,5
	xor	r14d,r9d
	and	r12d,ebx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((48-128))+rdi]
	xor	r13d,ebx
	add	r8d,DWORD[12+rsp]
	mov	esi,r9d
	ror	r14d,11
	xor	r12d,edx
	xor	esi,r10d
	ror	r13d,6
	add	r8d,r12d
	and	r15d,esi
	xor	r14d,r9d
	add	r8d,r13d
	xor	r15d,r10d
	add	eax,r8d
	ror	r14d,2
	add	r8d,r15d
	mov	r13d,eax
	add	r14d,r8d
; ---- round 4 of 16 (h=edx, d=r11d); schedule update for xmm1 ----
	vmovdqa	XMMWORD[rsp],xmm6
	vpalignr	xmm4,xmm2,xmm1,4
	ror	r13d,14
	mov	r8d,r14d
	vpalignr	xmm7,xmm0,xmm3,4
	mov	r12d,ebx
	xor	r13d,eax
DB	143,232,120,194,236,14		; vprotd xmm5,xmm4,14
	ror	r14d,9
	xor	r12d,ecx
	vpsrld	xmm4,xmm4,3
	ror	r13d,5
	xor	r14d,r8d
	vpaddd	xmm1,xmm1,xmm7
	and	r12d,eax
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((64-128))+rdi]
	xor	r13d,eax
	add	edx,DWORD[16+rsp]
	mov	r15d,r8d
DB	143,232,120,194,245,11		; vprotd xmm6,xmm5,11
	ror	r14d,11
	xor	r12d,ecx
	vpxor	xmm4,xmm4,xmm5
	xor	r15d,r9d
	ror	r13d,6
	add	edx,r12d
	and	esi,r15d
DB	143,232,120,194,248,13		; vprotd xmm7,xmm0,13
	xor	r14d,r8d
	add	edx,r13d
	vpxor	xmm4,xmm4,xmm6
	xor	esi,r9d
	add	r11d,edx
	vpsrld	xmm6,xmm0,10
	ror	r14d,2
	add	edx,esi
	vpaddd	xmm1,xmm1,xmm4
	mov	r13d,r11d
	add	r14d,edx
DB	143,232,120,194,239,2		; vprotd xmm5,xmm7,2
; ---- round 5 of 16 (h=ecx, d=r10d) ----
	ror	r13d,14
	mov	edx,r14d
	vpxor	xmm7,xmm7,xmm6
	mov	r12d,eax
	xor	r13d,r11d
	ror	r14d,9
	xor	r12d,ebx
	vpxor	xmm7,xmm7,xmm5
	ror	r13d,5
	xor	r14d,edx
	and	r12d,r11d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((80-128))+rdi]
	xor	r13d,r11d
	vpsrldq	xmm7,xmm7,8
	add	ecx,DWORD[20+rsp]
	mov	esi,edx
	ror	r14d,11
	xor	r12d,ebx
	vpaddd	xmm1,xmm1,xmm7
	xor	esi,r8d
	ror	r13d,6
	add	ecx,r12d
	and	r15d,esi
DB	143,232,120,194,249,13		; vprotd xmm7,xmm1,13
	xor	r14d,edx
	add	ecx,r13d
	vpsrld	xmm6,xmm1,10
	xor	r15d,r8d
	add	r10d,ecx
DB	143,232,120,194,239,2		; vprotd xmm5,xmm7,2
	ror	r14d,2
	add	ecx,r15d
	vpxor	xmm7,xmm7,xmm6
	mov	r13d,r10d
	add	r14d,ecx
; ---- round 6 of 16 (h=ebx, d=r9d) ----
	ror	r13d,14
	mov	ecx,r14d
	vpxor	xmm7,xmm7,xmm5
	mov	r12d,r11d
	xor	r13d,r10d
	ror	r14d,9
	xor	r12d,eax
	vpslldq	xmm7,xmm7,8
	ror	r13d,5
	xor	r14d,ecx
	and	r12d,r10d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((96-128))+rdi]
	xor	r13d,r10d
	vpaddd	xmm1,xmm1,xmm7
	add	ebx,DWORD[24+rsp]
	mov	r15d,ecx
	ror	r14d,11
	xor	r12d,eax
	vpaddd	xmm6,xmm1,XMMWORD[32+rbp]
	xor	r15d,edx
	ror	r13d,6
	add	ebx,r12d
	and	esi,r15d
	xor	r14d,ecx
	add	ebx,r13d
	xor	esi,edx
	add	r9d,ebx
	ror	r14d,2
	add	ebx,esi
	mov	r13d,r9d
	add	r14d,ebx
; ---- round 7 of 16 (h=eax, d=r8d) ----
	ror	r13d,14
	mov	ebx,r14d
	mov	r12d,r10d
	xor	r13d,r9d
	ror	r14d,9
	xor	r12d,r11d
	ror	r13d,5
	xor	r14d,ebx
	and	r12d,r9d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((112-128))+rdi]
	xor	r13d,r9d
	add	eax,DWORD[28+rsp]
	mov	esi,ebx
	ror	r14d,11
	xor	r12d,r11d
	xor	esi,ecx
	ror	r13d,6
	add	eax,r12d
	and	r15d,esi
	xor	r14d,ebx
	add	eax,r13d
	xor	r15d,ecx
	add	r8d,eax
	ror	r14d,2
	add	eax,r15d
	mov	r13d,r8d
	add	r14d,eax
; ---- round 8 of 16 (h=r11d, d=edx); schedule update for xmm2 ----
	vmovdqa	XMMWORD[16+rsp],xmm6
	vpalignr	xmm4,xmm3,xmm2,4
	ror	r13d,14
	mov	eax,r14d
	vpalignr	xmm7,xmm1,xmm0,4
	mov	r12d,r9d
	xor	r13d,r8d
DB	143,232,120,194,236,14		; vprotd xmm5,xmm4,14
	ror	r14d,9
	xor	r12d,r10d
	vpsrld	xmm4,xmm4,3
	ror	r13d,5
	xor	r14d,eax
	vpaddd	xmm2,xmm2,xmm7
	and	r12d,r8d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((128-128))+rdi]
	xor	r13d,r8d
	add	r11d,DWORD[32+rsp]
	mov	r15d,eax
DB	143,232,120,194,245,11		; vprotd xmm6,xmm5,11
	ror	r14d,11
	xor	r12d,r10d
	vpxor	xmm4,xmm4,xmm5
	xor	r15d,ebx
	ror	r13d,6
	add	r11d,r12d
	and	esi,r15d
DB	143,232,120,194,249,13		; vprotd xmm7,xmm1,13
	xor	r14d,eax
	add	r11d,r13d
	vpxor	xmm4,xmm4,xmm6
	xor	esi,ebx
	add	edx,r11d
	vpsrld	xmm6,xmm1,10
	ror	r14d,2
	add	r11d,esi
	vpaddd	xmm2,xmm2,xmm4
	mov	r13d,edx
	add	r14d,r11d
DB	143,232,120,194,239,2		; vprotd xmm5,xmm7,2
; ---- round 9 of 16 (h=r10d, d=ecx) ----
	ror	r13d,14
	mov	r11d,r14d
	vpxor	xmm7,xmm7,xmm6
	mov	r12d,r8d
	xor	r13d,edx
	ror	r14d,9
	xor	r12d,r9d
	vpxor	xmm7,xmm7,xmm5
	ror	r13d,5
	xor	r14d,r11d
	and	r12d,edx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((144-128))+rdi]
	xor	r13d,edx
	vpsrldq	xmm7,xmm7,8
	add	r10d,DWORD[36+rsp]
	mov	esi,r11d
	ror	r14d,11
	xor	r12d,r9d
	vpaddd	xmm2,xmm2,xmm7
	xor	esi,eax
	ror	r13d,6
	add	r10d,r12d
	and	r15d,esi
DB	143,232,120,194,250,13		; vprotd xmm7,xmm2,13
	xor	r14d,r11d
	add	r10d,r13d
	vpsrld	xmm6,xmm2,10
	xor	r15d,eax
	add	ecx,r10d
DB	143,232,120,194,239,2		; vprotd xmm5,xmm7,2
	ror	r14d,2
	add	r10d,r15d
	vpxor	xmm7,xmm7,xmm6
	mov	r13d,ecx
	add	r14d,r10d
; ---- round 10 of 16 (h=r9d, d=ebx) ----
	ror	r13d,14
	mov	r10d,r14d
	vpxor	xmm7,xmm7,xmm5
	mov	r12d,edx
	xor	r13d,ecx
	ror	r14d,9
	xor	r12d,r8d
	vpslldq	xmm7,xmm7,8
	ror	r13d,5
	xor	r14d,r10d
	and	r12d,ecx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((160-128))+rdi]
	xor	r13d,ecx
	vpaddd	xmm2,xmm2,xmm7
	add	r9d,DWORD[40+rsp]
	mov	r15d,r10d
	ror	r14d,11
	xor	r12d,r8d
	vpaddd	xmm6,xmm2,XMMWORD[64+rbp]
	xor	r15d,r11d
	ror	r13d,6
	add	r9d,r12d
	and	esi,r15d
	xor	r14d,r10d
	add	r9d,r13d
	xor	esi,r11d
	add	ebx,r9d
	ror	r14d,2
	add	r9d,esi
	mov	r13d,ebx
	add	r14d,r9d
; ---- round 11 of 16 (h=r8d, d=eax) ----
	ror	r13d,14
	mov	r9d,r14d
	mov	r12d,ecx
	xor	r13d,ebx
	ror	r14d,9
	xor	r12d,edx
	ror	r13d,5
	xor	r14d,r9d
	and	r12d,ebx
; AES: 1st candidate result (key at +160 used as last round key), then
; keep going with the same key as a regular round
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((176-128))+rdi]
	xor	r13d,ebx
	add	r8d,DWORD[44+rsp]
	mov	esi,r9d
	ror	r14d,11
	xor	r12d,edx
	xor	esi,r10d
	ror	r13d,6
	add	r8d,r12d
	and	r15d,esi
	xor	r14d,r9d
	add	r8d,r13d
	xor	r15d,r10d
	add	eax,r8d
	ror	r14d,2
	add	r8d,r15d
	mov	r13d,eax
	add	r14d,r8d
; ---- round 12 of 16 (h=edx, d=r11d); schedule update for xmm3 ----
	vmovdqa	XMMWORD[32+rsp],xmm6
	vpalignr	xmm4,xmm0,xmm3,4
	ror	r13d,14
	mov	r8d,r14d
	vpalignr	xmm7,xmm2,xmm1,4
	mov	r12d,ebx
	xor	r13d,eax
DB	143,232,120,194,236,14		; vprotd xmm5,xmm4,14
	ror	r14d,9
	xor	r12d,ecx
	vpsrld	xmm4,xmm4,3
	ror	r13d,5
	xor	r14d,r8d
	vpaddd	xmm3,xmm3,xmm7
	and	r12d,eax
; AES: keep the 1st candidate only if mask xmm12 is set
	vpand	xmm8,xmm11,xmm12
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((192-128))+rdi]
	xor	r13d,eax
	add	edx,DWORD[48+rsp]
	mov	r15d,r8d
DB	143,232,120,194,245,11		; vprotd xmm6,xmm5,11
	ror	r14d,11
	xor	r12d,ecx
	vpxor	xmm4,xmm4,xmm5
	xor	r15d,r9d
	ror	r13d,6
	add	edx,r12d
	and	esi,r15d
DB	143,232,120,194,250,13		; vprotd xmm7,xmm2,13
	xor	r14d,r8d
	add	edx,r13d
	vpxor	xmm4,xmm4,xmm6
	xor	esi,r9d
	add	r11d,edx
	vpsrld	xmm6,xmm2,10
	ror	r14d,2
	add	edx,esi
	vpaddd	xmm3,xmm3,xmm4
	mov	r13d,r11d
	add	r14d,edx
DB	143,232,120,194,239,2		; vprotd xmm5,xmm7,2
; ---- round 13 of 16 (h=ecx, d=r10d) ----
	ror	r13d,14
	mov	edx,r14d
	vpxor	xmm7,xmm7,xmm6
	mov	r12d,eax
	xor	r13d,r11d
	ror	r14d,9
	xor	r12d,ebx
	vpxor	xmm7,xmm7,xmm5
	ror	r13d,5
	xor	r14d,edx
	and	r12d,r11d
; AES: 2nd candidate result (key at +192 used as last round key)
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((208-128))+rdi]
	xor	r13d,r11d
	vpsrldq	xmm7,xmm7,8
	add	ecx,DWORD[52+rsp]
	mov	esi,edx
	ror	r14d,11
	xor	r12d,ebx
	vpaddd	xmm3,xmm3,xmm7
	xor	esi,r8d
	ror	r13d,6
	add	ecx,r12d
	and	r15d,esi
DB	143,232,120,194,251,13		; vprotd xmm7,xmm3,13
	xor	r14d,edx
	add	ecx,r13d
	vpsrld	xmm6,xmm3,10
	xor	r15d,r8d
	add	r10d,ecx
DB	143,232,120,194,239,2		; vprotd xmm5,xmm7,2
	ror	r14d,2
	add	ecx,r15d
	vpxor	xmm7,xmm7,xmm6
	mov	r13d,r10d
	add	r14d,ecx
; ---- round 14 of 16 (h=ebx, d=r9d) ----
	ror	r13d,14
	mov	ecx,r14d
	vpxor	xmm7,xmm7,xmm5
	mov	r12d,r11d
	xor	r13d,r10d
	ror	r14d,9
	xor	r12d,eax
	vpslldq	xmm7,xmm7,8
	ror	r13d,5
	xor	r14d,ecx
	and	r12d,r10d
; AES: keep the 2nd candidate only if mask xmm13 is set
	vpand	xmm11,xmm11,xmm13
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((224-128))+rdi]
	xor	r13d,r10d
	vpaddd	xmm3,xmm3,xmm7
	add	ebx,DWORD[56+rsp]
	mov	r15d,ecx
	ror	r14d,11
	xor	r12d,eax
	vpaddd	xmm6,xmm3,XMMWORD[96+rbp]
	xor	r15d,edx
	ror	r13d,6
	add	ebx,r12d
	and	esi,r15d
	xor	r14d,ecx
	add	ebx,r13d
	xor	esi,edx
	add	r9d,ebx
	ror	r14d,2
	add	ebx,esi
	mov	r13d,r9d
	add	r14d,ebx
; ---- round 15 of 16 (h=eax, d=r8d) ----
	ror	r13d,14
	mov	ebx,r14d
	mov	r12d,r10d
	xor	r13d,r9d
	ror	r14d,9
	xor	r12d,r11d
	ror	r13d,5
	xor	r14d,ebx
	and	r12d,r9d
; AES: merge, then 3rd candidate result (key at +224 as last round key);
; xmm10 is reloaded with round key 0 for the next block
	vpor	xmm8,xmm8,xmm11
	vaesenclast	xmm11,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((0-128))+rdi]
	xor	r13d,r9d
	add	eax,DWORD[60+rsp]
	mov	esi,ebx
	ror	r14d,11
	xor	r12d,r11d
	xor	esi,ecx
	ror	r13d,6
	add	eax,r12d
	and	r15d,esi
	xor	r14d,ebx
	add	eax,r13d
	xor	r15d,ecx
	add	r8d,eax
	ror	r14d,2
	add	eax,r15d
	mov	r13d,r8d
	add	r14d,eax
	vmovdqa	XMMWORD[48+rsp],xmm6
; AES: finish the mask-select, store the ciphertext block at
; input cursor + (out - in), advance the cursor by 16 bytes
	mov	r12,QWORD[((64+0))+rsp]
	vpand	xmm11,xmm11,xmm14
	mov	r15,QWORD[((64+8))+rsp]
	vpor	xmm8,xmm8,xmm11
	vmovdqu	XMMWORD[r12*1+r15],xmm8
	lea	r12,[16+r12]
; loop until [128+rbp] reaches the byte-swap mask (top byte zero)
	cmp	BYTE[131+rbp],0
	jne	NEAR $L$xop_00_47
; Final 16 SHA-256 rounds of the block (reached after three passes of the
; $L$xop_00_47 loop): same scalar round pattern, but with no further
; message-schedule expansion -- W+K is simply read from [rsp .. rsp+63].
; One more AES block is CBC-encrypted alongside, then the working
; variables are added into the SHA-256 state and the outer loop repeats
; while the input cursor is below the end pointer.
	vmovdqu	xmm9,XMMWORD[r12]
	mov	QWORD[((64+0))+rsp],r12
; ---- round 0 of 16 (h=r11d, d=edx) ----
	ror	r13d,14
	mov	eax,r14d
	mov	r12d,r9d
	xor	r13d,r8d
	ror	r14d,9
	xor	r12d,r10d
	ror	r13d,5
	xor	r14d,eax
	and	r12d,r8d
; AES: whitening with round key 0
	vpxor	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((16-128))+rdi]
	xor	r13d,r8d
	add	r11d,DWORD[rsp]
	mov	r15d,eax
	ror	r14d,11
	xor	r12d,r10d
	xor	r15d,ebx
	ror	r13d,6
	add	r11d,r12d
	and	esi,r15d
	xor	r14d,eax
	add	r11d,r13d
	xor	esi,ebx
	add	edx,r11d
	ror	r14d,2
	add	r11d,esi
	mov	r13d,edx
	add	r14d,r11d
; ---- round 1 of 16 (h=r10d, d=ecx) ----
	ror	r13d,14
	mov	r11d,r14d
	mov	r12d,r8d
	xor	r13d,edx
	ror	r14d,9
	xor	r12d,r9d
	ror	r13d,5
	xor	r14d,r11d
	and	r12d,edx
; AES: CBC chaining with the previous ciphertext block
	vpxor	xmm9,xmm9,xmm8
	xor	r13d,edx
	add	r10d,DWORD[4+rsp]
	mov	esi,r11d
	ror	r14d,11
	xor	r12d,r9d
	xor	esi,eax
	ror	r13d,6
	add	r10d,r12d
	and	r15d,esi
	xor	r14d,r11d
	add	r10d,r13d
	xor	r15d,eax
	add	ecx,r10d
	ror	r14d,2
	add	r10d,r15d
	mov	r13d,ecx
	add	r14d,r10d
; ---- round 2 of 16 (h=r9d, d=ebx) ----
	ror	r13d,14
	mov	r10d,r14d
	mov	r12d,edx
	xor	r13d,ecx
	ror	r14d,9
	xor	r12d,r8d
	ror	r13d,5
	xor	r14d,r10d
	and	r12d,ecx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((32-128))+rdi]
	xor	r13d,ecx
	add	r9d,DWORD[8+rsp]
	mov	r15d,r10d
	ror	r14d,11
	xor	r12d,r8d
	xor	r15d,r11d
	ror	r13d,6
	add	r9d,r12d
	and	esi,r15d
	xor	r14d,r10d
	add	r9d,r13d
	xor	esi,r11d
	add	ebx,r9d
	ror	r14d,2
	add	r9d,esi
	mov	r13d,ebx
	add	r14d,r9d
; ---- round 3 of 16 (h=r8d, d=eax) ----
	ror	r13d,14
	mov	r9d,r14d
	mov	r12d,ecx
	xor	r13d,ebx
	ror	r14d,9
	xor	r12d,edx
	ror	r13d,5
	xor	r14d,r9d
	and	r12d,ebx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((48-128))+rdi]
	xor	r13d,ebx
	add	r8d,DWORD[12+rsp]
	mov	esi,r9d
	ror	r14d,11
	xor	r12d,edx
	xor	esi,r10d
	ror	r13d,6
	add	r8d,r12d
	and	r15d,esi
	xor	r14d,r9d
	add	r8d,r13d
	xor	r15d,r10d
	add	eax,r8d
	ror	r14d,2
	add	r8d,r15d
	mov	r13d,eax
	add	r14d,r8d
; ---- round 4 of 16 (h=edx, d=r11d) ----
	ror	r13d,14
	mov	r8d,r14d
	mov	r12d,ebx
	xor	r13d,eax
	ror	r14d,9
	xor	r12d,ecx
	ror	r13d,5
	xor	r14d,r8d
	and	r12d,eax
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((64-128))+rdi]
	xor	r13d,eax
	add	edx,DWORD[16+rsp]
	mov	r15d,r8d
	ror	r14d,11
	xor	r12d,ecx
	xor	r15d,r9d
	ror	r13d,6
	add	edx,r12d
	and	esi,r15d
	xor	r14d,r8d
	add	edx,r13d
	xor	esi,r9d
	add	r11d,edx
	ror	r14d,2
	add	edx,esi
	mov	r13d,r11d
	add	r14d,edx
; ---- round 5 of 16 (h=ecx, d=r10d) ----
	ror	r13d,14
	mov	edx,r14d
	mov	r12d,eax
	xor	r13d,r11d
	ror	r14d,9
	xor	r12d,ebx
	ror	r13d,5
	xor	r14d,edx
	and	r12d,r11d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((80-128))+rdi]
	xor	r13d,r11d
	add	ecx,DWORD[20+rsp]
	mov	esi,edx
	ror	r14d,11
	xor	r12d,ebx
	xor	esi,r8d
	ror	r13d,6
	add	ecx,r12d
	and	r15d,esi
	xor	r14d,edx
	add	ecx,r13d
	xor	r15d,r8d
	add	r10d,ecx
	ror	r14d,2
	add	ecx,r15d
	mov	r13d,r10d
	add	r14d,ecx
; ---- round 6 of 16 (h=ebx, d=r9d) ----
	ror	r13d,14
	mov	ecx,r14d
	mov	r12d,r11d
	xor	r13d,r10d
	ror	r14d,9
	xor	r12d,eax
	ror	r13d,5
	xor	r14d,ecx
	and	r12d,r10d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((96-128))+rdi]
	xor	r13d,r10d
	add	ebx,DWORD[24+rsp]
	mov	r15d,ecx
	ror	r14d,11
	xor	r12d,eax
	xor	r15d,edx
	ror	r13d,6
	add	ebx,r12d
	and	esi,r15d
	xor	r14d,ecx
	add	ebx,r13d
	xor	esi,edx
	add	r9d,ebx
	ror	r14d,2
	add	ebx,esi
	mov	r13d,r9d
	add	r14d,ebx
; ---- round 7 of 16 (h=eax, d=r8d) ----
	ror	r13d,14
	mov	ebx,r14d
	mov	r12d,r10d
	xor	r13d,r9d
	ror	r14d,9
	xor	r12d,r11d
	ror	r13d,5
	xor	r14d,ebx
	and	r12d,r9d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((112-128))+rdi]
	xor	r13d,r9d
	add	eax,DWORD[28+rsp]
	mov	esi,ebx
	ror	r14d,11
	xor	r12d,r11d
	xor	esi,ecx
	ror	r13d,6
	add	eax,r12d
	and	r15d,esi
	xor	r14d,ebx
	add	eax,r13d
	xor	r15d,ecx
	add	r8d,eax
	ror	r14d,2
	add	eax,r15d
	mov	r13d,r8d
	add	r14d,eax
; ---- round 8 of 16 (h=r11d, d=edx) ----
	ror	r13d,14
	mov	eax,r14d
	mov	r12d,r9d
	xor	r13d,r8d
	ror	r14d,9
	xor	r12d,r10d
	ror	r13d,5
	xor	r14d,eax
	and	r12d,r8d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((128-128))+rdi]
	xor	r13d,r8d
	add	r11d,DWORD[32+rsp]
	mov	r15d,eax
	ror	r14d,11
	xor	r12d,r10d
	xor	r15d,ebx
	ror	r13d,6
	add	r11d,r12d
	and	esi,r15d
	xor	r14d,eax
	add	r11d,r13d
	xor	esi,ebx
	add	edx,r11d
	ror	r14d,2
	add	r11d,esi
	mov	r13d,edx
	add	r14d,r11d
; ---- round 9 of 16 (h=r10d, d=ecx) ----
	ror	r13d,14
	mov	r11d,r14d
	mov	r12d,r8d
	xor	r13d,edx
	ror	r14d,9
	xor	r12d,r9d
	ror	r13d,5
	xor	r14d,r11d
	and	r12d,edx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((144-128))+rdi]
	xor	r13d,edx
	add	r10d,DWORD[36+rsp]
	mov	esi,r11d
	ror	r14d,11
	xor	r12d,r9d
	xor	esi,eax
	ror	r13d,6
	add	r10d,r12d
	and	r15d,esi
	xor	r14d,r11d
	add	r10d,r13d
	xor	r15d,eax
	add	ecx,r10d
	ror	r14d,2
	add	r10d,r15d
	mov	r13d,ecx
	add	r14d,r10d
; ---- round 10 of 16 (h=r9d, d=ebx) ----
	ror	r13d,14
	mov	r10d,r14d
	mov	r12d,edx
	xor	r13d,ecx
	ror	r14d,9
	xor	r12d,r8d
	ror	r13d,5
	xor	r14d,r10d
	and	r12d,ecx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((160-128))+rdi]
	xor	r13d,ecx
	add	r9d,DWORD[40+rsp]
	mov	r15d,r10d
	ror	r14d,11
	xor	r12d,r8d
	xor	r15d,r11d
	ror	r13d,6
	add	r9d,r12d
	and	esi,r15d
	xor	r14d,r10d
	add	r9d,r13d
	xor	esi,r11d
	add	ebx,r9d
	ror	r14d,2
	add	r9d,esi
	mov	r13d,ebx
	add	r14d,r9d
; ---- round 11 of 16 (h=r8d, d=eax) ----
	ror	r13d,14
	mov	r9d,r14d
	mov	r12d,ecx
	xor	r13d,ebx
	ror	r14d,9
	xor	r12d,edx
	ror	r13d,5
	xor	r14d,r9d
	and	r12d,ebx
; AES: 1st candidate result (key at +160 used as last round key)
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((176-128))+rdi]
	xor	r13d,ebx
	add	r8d,DWORD[44+rsp]
	mov	esi,r9d
	ror	r14d,11
	xor	r12d,edx
	xor	esi,r10d
	ror	r13d,6
	add	r8d,r12d
	and	r15d,esi
	xor	r14d,r9d
	add	r8d,r13d
	xor	r15d,r10d
	add	eax,r8d
	ror	r14d,2
	add	r8d,r15d
	mov	r13d,eax
	add	r14d,r8d
; ---- round 12 of 16 (h=edx, d=r11d) ----
	ror	r13d,14
	mov	r8d,r14d
	mov	r12d,ebx
	xor	r13d,eax
	ror	r14d,9
	xor	r12d,ecx
	ror	r13d,5
	xor	r14d,r8d
	and	r12d,eax
; AES: keep the 1st candidate only if mask xmm12 is set
	vpand	xmm8,xmm11,xmm12
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((192-128))+rdi]
	xor	r13d,eax
	add	edx,DWORD[48+rsp]
	mov	r15d,r8d
	ror	r14d,11
	xor	r12d,ecx
	xor	r15d,r9d
	ror	r13d,6
	add	edx,r12d
	and	esi,r15d
	xor	r14d,r8d
	add	edx,r13d
	xor	esi,r9d
	add	r11d,edx
	ror	r14d,2
	add	edx,esi
	mov	r13d,r11d
	add	r14d,edx
; ---- round 13 of 16 (h=ecx, d=r10d) ----
	ror	r13d,14
	mov	edx,r14d
	mov	r12d,eax
	xor	r13d,r11d
	ror	r14d,9
	xor	r12d,ebx
	ror	r13d,5
	xor	r14d,edx
	and	r12d,r11d
; AES: 2nd candidate result (key at +192 used as last round key)
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((208-128))+rdi]
	xor	r13d,r11d
	add	ecx,DWORD[52+rsp]
	mov	esi,edx
	ror	r14d,11
	xor	r12d,ebx
	xor	esi,r8d
	ror	r13d,6
	add	ecx,r12d
	and	r15d,esi
	xor	r14d,edx
	add	ecx,r13d
	xor	r15d,r8d
	add	r10d,ecx
	ror	r14d,2
	add	ecx,r15d
	mov	r13d,r10d
	add	r14d,ecx
; ---- round 14 of 16 (h=ebx, d=r9d) ----
	ror	r13d,14
	mov	ecx,r14d
	mov	r12d,r11d
	xor	r13d,r10d
	ror	r14d,9
	xor	r12d,eax
	ror	r13d,5
	xor	r14d,ecx
	and	r12d,r10d
; AES: keep the 2nd candidate only if mask xmm13 is set
	vpand	xmm11,xmm11,xmm13
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((224-128))+rdi]
	xor	r13d,r10d
	add	ebx,DWORD[56+rsp]
	mov	r15d,ecx
	ror	r14d,11
	xor	r12d,eax
	xor	r15d,edx
	ror	r13d,6
	add	ebx,r12d
	and	esi,r15d
	xor	r14d,ecx
	add	ebx,r13d
	xor	esi,edx
	add	r9d,ebx
	ror	r14d,2
	add	ebx,esi
	mov	r13d,r9d
	add	r14d,ebx
; ---- round 15 of 16 (h=eax, d=r8d) ----
	ror	r13d,14
	mov	ebx,r14d
	mov	r12d,r10d
	xor	r13d,r9d
	ror	r14d,9
	xor	r12d,r11d
	ror	r13d,5
	xor	r14d,ebx
	and	r12d,r9d
; AES: merge, then 3rd candidate result (key at +224 as last round key);
; xmm10 is reloaded with round key 0 for the next block
	vpor	xmm8,xmm8,xmm11
	vaesenclast	xmm11,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((0-128))+rdi]
	xor	r13d,r9d
	add	eax,DWORD[60+rsp]
	mov	esi,ebx
	ror	r14d,11
	xor	r12d,r11d
	xor	esi,ecx
	ror	r13d,6
	add	eax,r12d
	and	r15d,esi
	xor	r14d,ebx
	add	eax,r13d
	xor	r15d,ecx
	add	r8d,eax
	ror	r14d,2
	add	eax,r15d
	mov	r13d,r8d
	add	r14d,eax
; reload the bookkeeping that the rounds clobbered:
; r12 = input cursor, r13 = out - in, r15 = SHA state, rsi = hash offset
	mov	r12,QWORD[((64+0))+rsp]
	mov	r13,QWORD[((64+8))+rsp]
	mov	r15,QWORD[((64+40))+rsp]
	mov	rsi,QWORD[((64+48))+rsp]

; AES: finish the mask-select and store the 4th ciphertext block
	vpand	xmm11,xmm11,xmm14
; eax = a (its final value was still pending in r14d)
	mov	eax,r14d
	vpor	xmm8,xmm8,xmm11
	vmovdqu	XMMWORD[r13*1+r12],xmm8
	lea	r12,[16+r12]

; add the working variables into the stored SHA-256 state
	add	eax,DWORD[r15]
	add	ebx,DWORD[4+r15]
	add	ecx,DWORD[8+r15]
	add	edx,DWORD[12+r15]
	add	r8d,DWORD[16+r15]
	add	r9d,DWORD[20+r15]
	add	r10d,DWORD[24+r15]
	add	r11d,DWORD[28+r15]

; compare cursor with end-of-input; the mov stores below leave the flags
; untouched for the jb
	cmp	r12,QWORD[((64+16))+rsp]

	mov	DWORD[r15],eax
	mov	DWORD[4+r15],ebx
	mov	DWORD[8+r15],ecx
	mov	DWORD[12+r15],edx
	mov	DWORD[16+r15],r8d
	mov	DWORD[20+r15],r9d
	mov	DWORD[24+r15],r10d
	mov	DWORD[28+r15],r11d

	jb	NEAR $L$loop_xop

; Function exit: write the last ciphertext block back into the IV buffer,
; wipe the vector registers, then restore every callee-saved register.
	mov	r8,QWORD[((64+32))+rsp]
; rsi = rsp as it was right after the six pushes in the prologue
	mov	rsi,QWORD[((64+56))+rsp]
	vmovdqu	XMMWORD[r8],xmm8
; clear all vector registers before the saved xmm6-xmm15 are reloaded
	vzeroall
	movaps	xmm6,XMMWORD[128+rsp]
	movaps	xmm7,XMMWORD[144+rsp]
	movaps	xmm8,XMMWORD[160+rsp]
	movaps	xmm9,XMMWORD[176+rsp]
	movaps	xmm10,XMMWORD[192+rsp]
	movaps	xmm11,XMMWORD[208+rsp]
	movaps	xmm12,XMMWORD[224+rsp]
	movaps	xmm13,XMMWORD[240+rsp]
	movaps	xmm14,XMMWORD[256+rsp]
	movaps	xmm15,XMMWORD[272+rsp]
; pushed GPRs are read back relative to rsi (reverse push order)
	mov	r15,QWORD[rsi]
	mov	r14,QWORD[8+rsi]
	mov	r13,QWORD[16+rsi]
	mov	r12,QWORD[24+rsi]
	mov	rbp,QWORD[32+rsi]
	mov	rbx,QWORD[40+rsi]
; rsp back to its value at function entry
	lea	rsp,[48+rsi]
$L$epilogue_xop:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret
$L$SEH_end_aesni_cbc_sha256_enc_xop:

ALIGN	64
aesni_cbc_sha256_enc_avx:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesni_cbc_sha256_enc_avx:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]


$L$avx_shortcut:
	mov	r10,QWORD[56+rsp]
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	mov	r11,rsp
	sub	rsp,288
	and	rsp,-64

	shl	rdx,6
	sub	rsi,rdi
	sub	r10,rdi
	add	rdx,rdi


	mov	QWORD[((64+8))+rsp],rsi
	mov	QWORD[((64+16))+rsp],rdx

	mov	QWORD[((64+32))+rsp],r8
	mov	QWORD[((64+40))+rsp],r9
	mov	QWORD[((64+48))+rsp],r10
	mov	QWORD[((64+56))+rsp],r11
	movaps	XMMWORD[128+rsp],xmm6
	movaps	XMMWORD[144+rsp],xmm7
	movaps	XMMWORD[160+rsp],xmm8
	movaps	XMMWORD[176+rsp],xmm9
	movaps	XMMWORD[192+rsp],xmm10
	movaps	XMMWORD[208+rsp],xmm11
	movaps	XMMWORD[224+rsp],xmm12
	movaps	XMMWORD[240+rsp],xmm13
	movaps	XMMWORD[256+rsp],xmm14
	movaps	XMMWORD[272+rsp],xmm15
$L$prologue_avx:
	vzeroall

	mov	r12,rdi
	lea	rdi,[128+rcx]
	lea	r13,[((K256+544))]
	mov	r14d,DWORD[((240-128))+rdi]
	mov	r15,r9
	mov	rsi,r10
	vmovdqu	xmm8,XMMWORD[r8]
	sub	r14,9

	mov	eax,DWORD[r15]
	mov	ebx,DWORD[4+r15]
	mov	ecx,DWORD[8+r15]
	mov	edx,DWORD[12+r15]
	mov	r8d,DWORD[16+r15]
	mov	r9d,DWORD[20+r15]
	mov	r10d,DWORD[24+r15]
	mov	r11d,DWORD[28+r15]

	vmovdqa	xmm14,XMMWORD[r14*8+r13]
	vmovdqa	xmm13,XMMWORD[16+r14*8+r13]
	vmovdqa	xmm12,XMMWORD[32+r14*8+r13]
	vmovdqu	xmm10,XMMWORD[((0-128))+rdi]
	jmp	NEAR $L$loop_avx
ALIGN	16
$L$loop_avx:
	vmovdqa	xmm7,XMMWORD[((K256+512))]
	vmovdqu	xmm0,XMMWORD[r12*1+rsi]
	vmovdqu	xmm1,XMMWORD[16+r12*1+rsi]
	vmovdqu	xmm2,XMMWORD[32+r12*1+rsi]
	vmovdqu	xmm3,XMMWORD[48+r12*1+rsi]
	vpshufb	xmm0,xmm0,xmm7
	lea	rbp,[K256]
	vpshufb	xmm1,xmm1,xmm7
	vpshufb	xmm2,xmm2,xmm7
	vpaddd	xmm4,xmm0,XMMWORD[rbp]
	vpshufb	xmm3,xmm3,xmm7
	vpaddd	xmm5,xmm1,XMMWORD[32+rbp]
	vpaddd	xmm6,xmm2,XMMWORD[64+rbp]
	vpaddd	xmm7,xmm3,XMMWORD[96+rbp]
	vmovdqa	XMMWORD[rsp],xmm4
	mov	r14d,eax
	vmovdqa	XMMWORD[16+rsp],xmm5
	mov	esi,ebx
	vmovdqa	XMMWORD[32+rsp],xmm6
	xor	esi,ecx
	vmovdqa	XMMWORD[48+rsp],xmm7
	mov	r13d,r8d
	jmp	NEAR $L$avx_00_47

ALIGN	16
$L$avx_00_47:
	sub	rbp,-16*2*4
	vmovdqu	xmm9,XMMWORD[r12]
	mov	QWORD[((64+0))+rsp],r12
	vpalignr	xmm4,xmm1,xmm0,4
	shrd	r13d,r13d,14
	mov	eax,r14d
	mov	r12d,r9d
	vpalignr	xmm7,xmm3,xmm2,4
	xor	r13d,r8d
	shrd	r14d,r14d,9
	xor	r12d,r10d
	vpsrld	xmm6,xmm4,7
	shrd	r13d,r13d,5
	xor	r14d,eax
	and	r12d,r8d
	vpaddd	xmm0,xmm0,xmm7
	vpxor	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((16-128))+rdi]
	xor	r13d,r8d
	add	r11d,DWORD[rsp]
	mov	r15d,eax
	vpsrld	xmm7,xmm4,3
	shrd	r14d,r14d,11
	xor	r12d,r10d
	xor	r15d,ebx
	vpslld	xmm5,xmm4,14
	shrd	r13d,r13d,6
	add	r11d,r12d
	and	esi,r15d
	vpxor	xmm4,xmm7,xmm6
	xor	r14d,eax
	add	r11d,r13d
	xor	esi,ebx
	vpshufd	xmm7,xmm3,250
	add	edx,r11d
	shrd	r14d,r14d,2
	add	r11d,esi
	vpsrld	xmm6,xmm6,11
	mov	r13d,edx
	add	r14d,r11d
	shrd	r13d,r13d,14
	vpxor	xmm4,xmm4,xmm5
	mov	r11d,r14d
	mov	r12d,r8d
	xor	r13d,edx
	vpslld	xmm5,xmm5,11
	shrd	r14d,r14d,9
	xor	r12d,r9d
	shrd	r13d,r13d,5
	vpxor	xmm4,xmm4,xmm6
	xor	r14d,r11d
	and	r12d,edx
	vpxor	xmm9,xmm9,xmm8
	xor	r13d,edx
	vpsrld	xmm6,xmm7,10
	add	r10d,DWORD[4+rsp]
	mov	esi,r11d
	shrd	r14d,r14d,11
	vpxor	xmm4,xmm4,xmm5
	xor	r12d,r9d
	xor	esi,eax
	shrd	r13d,r13d,6
	vpsrlq	xmm7,xmm7,17
	add	r10d,r12d
	and	r15d,esi
	xor	r14d,r11d
	vpaddd	xmm0,xmm0,xmm4
	add	r10d,r13d
	xor	r15d,eax
	add	ecx,r10d
	vpxor	xmm6,xmm6,xmm7
	shrd	r14d,r14d,2
	add	r10d,r15d
	mov	r13d,ecx
	vpsrlq	xmm7,xmm7,2
	add	r14d,r10d
	shrd	r13d,r13d,14
	mov	r10d,r14d
	vpxor	xmm6,xmm6,xmm7
	mov	r12d,edx
	xor	r13d,ecx
	shrd	r14d,r14d,9
	vpshufd	xmm6,xmm6,132
	xor	r12d,r8d
	shrd	r13d,r13d,5
	xor	r14d,r10d
	vpsrldq	xmm6,xmm6,8
	and	r12d,ecx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((32-128))+rdi]
	xor	r13d,ecx
	add	r9d,DWORD[8+rsp]
	vpaddd	xmm0,xmm0,xmm6
	mov	r15d,r10d
	shrd	r14d,r14d,11
	xor	r12d,r8d
	vpshufd	xmm7,xmm0,80
	xor	r15d,r11d
	shrd	r13d,r13d,6
	add	r9d,r12d
	vpsrld	xmm6,xmm7,10
	and	esi,r15d
	xor	r14d,r10d
	add	r9d,r13d
	vpsrlq	xmm7,xmm7,17
	xor	esi,r11d
	add	ebx,r9d
	shrd	r14d,r14d,2
	vpxor	xmm6,xmm6,xmm7
	add	r9d,esi
	mov	r13d,ebx
	add	r14d,r9d
	vpsrlq	xmm7,xmm7,2
	shrd	r13d,r13d,14
	mov	r9d,r14d
	mov	r12d,ecx
	vpxor	xmm6,xmm6,xmm7
	xor	r13d,ebx
	shrd	r14d,r14d,9
	xor	r12d,edx
	vpshufd	xmm6,xmm6,232
	shrd	r13d,r13d,5
	xor	r14d,r9d
	and	r12d,ebx
	vpslldq	xmm6,xmm6,8
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((48-128))+rdi]
	xor	r13d,ebx
	add	r8d,DWORD[12+rsp]
	mov	esi,r9d
	vpaddd	xmm0,xmm0,xmm6
	shrd	r14d,r14d,11
	xor	r12d,edx
	xor	esi,r10d
	vpaddd	xmm6,xmm0,XMMWORD[rbp]
	shrd	r13d,r13d,6
	add	r8d,r12d
	and	r15d,esi
	xor	r14d,r9d
	add	r8d,r13d
	xor	r15d,r10d
	add	eax,r8d
	shrd	r14d,r14d,2
	add	r8d,r15d
	mov	r13d,eax
	add	r14d,r8d
	vmovdqa	XMMWORD[rsp],xmm6
	vpalignr	xmm4,xmm2,xmm1,4
	shrd	r13d,r13d,14
	mov	r8d,r14d
	mov	r12d,ebx
	vpalignr	xmm7,xmm0,xmm3,4
	xor	r13d,eax
	shrd	r14d,r14d,9
	xor	r12d,ecx
	vpsrld	xmm6,xmm4,7
	shrd	r13d,r13d,5
	xor	r14d,r8d
	and	r12d,eax
	vpaddd	xmm1,xmm1,xmm7
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((64-128))+rdi]
	xor	r13d,eax
	add	edx,DWORD[16+rsp]
	mov	r15d,r8d
	vpsrld	xmm7,xmm4,3
	shrd	r14d,r14d,11
	xor	r12d,ecx
	xor	r15d,r9d
	vpslld	xmm5,xmm4,14
	shrd	r13d,r13d,6
	add	edx,r12d
	and	esi,r15d
	vpxor	xmm4,xmm7,xmm6
	xor	r14d,r8d
	add	edx,r13d
	xor	esi,r9d
	vpshufd	xmm7,xmm0,250
	add	r11d,edx
	shrd	r14d,r14d,2
	add	edx,esi
	vpsrld	xmm6,xmm6,11
	mov	r13d,r11d
	add	r14d,edx
	shrd	r13d,r13d,14
	vpxor	xmm4,xmm4,xmm5
	mov	edx,r14d
	mov	r12d,eax
	xor	r13d,r11d
	vpslld	xmm5,xmm5,11
	shrd	r14d,r14d,9
	xor	r12d,ebx
	shrd	r13d,r13d,5
	vpxor	xmm4,xmm4,xmm6
	xor	r14d,edx
	and	r12d,r11d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((80-128))+rdi]
	xor	r13d,r11d
	vpsrld	xmm6,xmm7,10
	add	ecx,DWORD[20+rsp]
	mov	esi,edx
	shrd	r14d,r14d,11
	vpxor	xmm4,xmm4,xmm5
	xor	r12d,ebx
	xor	esi,r8d
	shrd	r13d,r13d,6
	vpsrlq	xmm7,xmm7,17
	add	ecx,r12d
	and	r15d,esi
	xor	r14d,edx
	vpaddd	xmm1,xmm1,xmm4
	add	ecx,r13d
	xor	r15d,r8d
	add	r10d,ecx
	vpxor	xmm6,xmm6,xmm7
	shrd	r14d,r14d,2
	add	ecx,r15d
	mov	r13d,r10d
	vpsrlq	xmm7,xmm7,2
	add	r14d,ecx
	shrd	r13d,r13d,14
	mov	ecx,r14d
	vpxor	xmm6,xmm6,xmm7
	mov	r12d,r11d
	xor	r13d,r10d
	shrd	r14d,r14d,9
	vpshufd	xmm6,xmm6,132
	xor	r12d,eax
	shrd	r13d,r13d,5
	xor	r14d,ecx
	vpsrldq	xmm6,xmm6,8
	and	r12d,r10d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((96-128))+rdi]
	xor	r13d,r10d
	add	ebx,DWORD[24+rsp]
	vpaddd	xmm1,xmm1,xmm6
	mov	r15d,ecx
	shrd	r14d,r14d,11
	xor	r12d,eax
	vpshufd	xmm7,xmm1,80
	xor	r15d,edx
	shrd	r13d,r13d,6
	add	ebx,r12d
	vpsrld	xmm6,xmm7,10
	and	esi,r15d
	xor	r14d,ecx
	add	ebx,r13d
	vpsrlq	xmm7,xmm7,17
	xor	esi,edx
	add	r9d,ebx
	shrd	r14d,r14d,2
	vpxor	xmm6,xmm6,xmm7
	add	ebx,esi
	mov	r13d,r9d
	add	r14d,ebx
	vpsrlq	xmm7,xmm7,2
	shrd	r13d,r13d,14
	mov	ebx,r14d
	mov	r12d,r10d
	vpxor	xmm6,xmm6,xmm7
	xor	r13d,r9d
	shrd	r14d,r14d,9
	xor	r12d,r11d
	vpshufd	xmm6,xmm6,232
	shrd	r13d,r13d,5
	xor	r14d,ebx
	and	r12d,r9d
	vpslldq	xmm6,xmm6,8
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((112-128))+rdi]
	xor	r13d,r9d
	add	eax,DWORD[28+rsp]
	mov	esi,ebx
	vpaddd	xmm1,xmm1,xmm6
	shrd	r14d,r14d,11
	xor	r12d,r11d
	xor	esi,ecx
	vpaddd	xmm6,xmm1,XMMWORD[32+rbp]
	shrd	r13d,r13d,6
	add	eax,r12d
	and	r15d,esi
	xor	r14d,ebx
	add	eax,r13d
	xor	r15d,ecx
	add	r8d,eax
	shrd	r14d,r14d,2
	add	eax,r15d
	mov	r13d,r8d
	add	r14d,eax
	vmovdqa	XMMWORD[16+rsp],xmm6
	vpalignr	xmm4,xmm3,xmm2,4
	shrd	r13d,r13d,14
	mov	eax,r14d
	mov	r12d,r9d
	vpalignr	xmm7,xmm1,xmm0,4
	xor	r13d,r8d
	shrd	r14d,r14d,9
	xor	r12d,r10d
	vpsrld	xmm6,xmm4,7
	shrd	r13d,r13d,5
	xor	r14d,eax
	and	r12d,r8d
	vpaddd	xmm2,xmm2,xmm7
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((128-128))+rdi]
	xor	r13d,r8d
	add	r11d,DWORD[32+rsp]
	mov	r15d,eax
	vpsrld	xmm7,xmm4,3
	shrd	r14d,r14d,11
	xor	r12d,r10d
	xor	r15d,ebx
	vpslld	xmm5,xmm4,14
	shrd	r13d,r13d,6
	add	r11d,r12d
	and	esi,r15d
	vpxor	xmm4,xmm7,xmm6
	xor	r14d,eax
	add	r11d,r13d
	xor	esi,ebx
	vpshufd	xmm7,xmm1,250
	add	edx,r11d
	shrd	r14d,r14d,2
	add	r11d,esi
	vpsrld	xmm6,xmm6,11
	mov	r13d,edx
	add	r14d,r11d
	shrd	r13d,r13d,14
	vpxor	xmm4,xmm4,xmm5
	mov	r11d,r14d
	mov	r12d,r8d
	xor	r13d,edx
	vpslld	xmm5,xmm5,11
	shrd	r14d,r14d,9
	xor	r12d,r9d
	shrd	r13d,r13d,5
	vpxor	xmm4,xmm4,xmm6
	xor	r14d,r11d
	and	r12d,edx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((144-128))+rdi]
	xor	r13d,edx
	vpsrld	xmm6,xmm7,10
	add	r10d,DWORD[36+rsp]
	mov	esi,r11d
	shrd	r14d,r14d,11
	vpxor	xmm4,xmm4,xmm5
	xor	r12d,r9d
	xor	esi,eax
	shrd	r13d,r13d,6
	vpsrlq	xmm7,xmm7,17
	add	r10d,r12d
	and	r15d,esi
	xor	r14d,r11d
	vpaddd	xmm2,xmm2,xmm4
	add	r10d,r13d
	xor	r15d,eax
	add	ecx,r10d
	vpxor	xmm6,xmm6,xmm7
	shrd	r14d,r14d,2
	add	r10d,r15d
	mov	r13d,ecx
	vpsrlq	xmm7,xmm7,2
	add	r14d,r10d
	shrd	r13d,r13d,14
	mov	r10d,r14d
	vpxor	xmm6,xmm6,xmm7
	mov	r12d,edx
	xor	r13d,ecx
	shrd	r14d,r14d,9
	vpshufd	xmm6,xmm6,132
	xor	r12d,r8d
	shrd	r13d,r13d,5
	xor	r14d,r10d
	vpsrldq	xmm6,xmm6,8
	and	r12d,ecx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((160-128))+rdi]
	xor	r13d,ecx
	add	r9d,DWORD[40+rsp]
	vpaddd	xmm2,xmm2,xmm6
	mov	r15d,r10d
	shrd	r14d,r14d,11
	xor	r12d,r8d
	vpshufd	xmm7,xmm2,80
	xor	r15d,r11d
	shrd	r13d,r13d,6
	add	r9d,r12d
	vpsrld	xmm6,xmm7,10
	and	esi,r15d
	xor	r14d,r10d
	add	r9d,r13d
	vpsrlq	xmm7,xmm7,17
	xor	esi,r11d
	add	ebx,r9d
	shrd	r14d,r14d,2
	vpxor	xmm6,xmm6,xmm7
	add	r9d,esi
	mov	r13d,ebx
	add	r14d,r9d
	vpsrlq	xmm7,xmm7,2
	shrd	r13d,r13d,14
	mov	r9d,r14d
	mov	r12d,ecx
	vpxor	xmm6,xmm6,xmm7
	xor	r13d,ebx
	shrd	r14d,r14d,9
	xor	r12d,edx
	vpshufd	xmm6,xmm6,232
	shrd	r13d,r13d,5
	xor	r14d,r9d
	and	r12d,ebx
	vpslldq	xmm6,xmm6,8
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((176-128))+rdi]
	xor	r13d,ebx
	add	r8d,DWORD[44+rsp]
	mov	esi,r9d
	vpaddd	xmm2,xmm2,xmm6
	shrd	r14d,r14d,11
	xor	r12d,edx
	xor	esi,r10d
	vpaddd	xmm6,xmm2,XMMWORD[64+rbp]
	shrd	r13d,r13d,6
	add	r8d,r12d
	and	r15d,esi
	xor	r14d,r9d
	add	r8d,r13d
	xor	r15d,r10d
	add	eax,r8d
	shrd	r14d,r14d,2
	add	r8d,r15d
	mov	r13d,eax
	add	r14d,r8d
	vmovdqa	XMMWORD[32+rsp],xmm6
	vpalignr	xmm4,xmm0,xmm3,4
	shrd	r13d,r13d,14
	mov	r8d,r14d
	mov	r12d,ebx
	vpalignr	xmm7,xmm2,xmm1,4
	xor	r13d,eax
	shrd	r14d,r14d,9
	xor	r12d,ecx
	vpsrld	xmm6,xmm4,7
	shrd	r13d,r13d,5
	xor	r14d,r8d
	and	r12d,eax
	vpaddd	xmm3,xmm3,xmm7
	vpand	xmm8,xmm11,xmm12
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((192-128))+rdi]
	xor	r13d,eax
	add	edx,DWORD[48+rsp]
	mov	r15d,r8d
	vpsrld	xmm7,xmm4,3
	shrd	r14d,r14d,11
	xor	r12d,ecx
	xor	r15d,r9d
	vpslld	xmm5,xmm4,14
	shrd	r13d,r13d,6
	add	edx,r12d
	and	esi,r15d
	vpxor	xmm4,xmm7,xmm6
	xor	r14d,r8d
	add	edx,r13d
	xor	esi,r9d
	vpshufd	xmm7,xmm2,250
	add	r11d,edx
	shrd	r14d,r14d,2
	add	edx,esi
	vpsrld	xmm6,xmm6,11
	mov	r13d,r11d
	add	r14d,edx
	shrd	r13d,r13d,14
	vpxor	xmm4,xmm4,xmm5
	mov	edx,r14d
	mov	r12d,eax
	xor	r13d,r11d
	vpslld	xmm5,xmm5,11
	shrd	r14d,r14d,9
	xor	r12d,ebx
	shrd	r13d,r13d,5
	vpxor	xmm4,xmm4,xmm6
	xor	r14d,edx
	and	r12d,r11d
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((208-128))+rdi]
	xor	r13d,r11d
	vpsrld	xmm6,xmm7,10
	add	ecx,DWORD[52+rsp]
	mov	esi,edx
	shrd	r14d,r14d,11
	vpxor	xmm4,xmm4,xmm5
	xor	r12d,ebx
	xor	esi,r8d
	shrd	r13d,r13d,6
	vpsrlq	xmm7,xmm7,17
	add	ecx,r12d
	and	r15d,esi
	xor	r14d,edx
	vpaddd	xmm3,xmm3,xmm4
	add	ecx,r13d
	xor	r15d,r8d
	add	r10d,ecx
	vpxor	xmm6,xmm6,xmm7
	shrd	r14d,r14d,2
	add	ecx,r15d
	mov	r13d,r10d
	vpsrlq	xmm7,xmm7,2
	add	r14d,ecx
	shrd	r13d,r13d,14
	mov	ecx,r14d
	vpxor	xmm6,xmm6,xmm7
	mov	r12d,r11d
	xor	r13d,r10d
	shrd	r14d,r14d,9
	vpshufd	xmm6,xmm6,132
	xor	r12d,eax
	shrd	r13d,r13d,5
	xor	r14d,ecx
	vpsrldq	xmm6,xmm6,8
	and	r12d,r10d
	vpand	xmm11,xmm11,xmm13
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((224-128))+rdi]
	xor	r13d,r10d
	add	ebx,DWORD[56+rsp]
	vpaddd	xmm3,xmm3,xmm6
	mov	r15d,ecx
	shrd	r14d,r14d,11
	xor	r12d,eax
	vpshufd	xmm7,xmm3,80
	xor	r15d,edx
	shrd	r13d,r13d,6
	add	ebx,r12d
	vpsrld	xmm6,xmm7,10
	and	esi,r15d
	xor	r14d,ecx
	add	ebx,r13d
	vpsrlq	xmm7,xmm7,17
	xor	esi,edx
	add	r9d,ebx
	shrd	r14d,r14d,2
	vpxor	xmm6,xmm6,xmm7
	add	ebx,esi
	mov	r13d,r9d
	add	r14d,ebx
	vpsrlq	xmm7,xmm7,2
	shrd	r13d,r13d,14
	mov	ebx,r14d
	mov	r12d,r10d
	vpxor	xmm6,xmm6,xmm7
	xor	r13d,r9d
	shrd	r14d,r14d,9
	xor	r12d,r11d
	vpshufd	xmm6,xmm6,232
	shrd	r13d,r13d,5
	xor	r14d,ebx
	and	r12d,r9d
	vpslldq	xmm6,xmm6,8
	vpor	xmm8,xmm8,xmm11
	vaesenclast	xmm11,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((0-128))+rdi]
	xor	r13d,r9d
	add	eax,DWORD[60+rsp]
	mov	esi,ebx
	vpaddd	xmm3,xmm3,xmm6
	shrd	r14d,r14d,11
	xor	r12d,r11d
	xor	esi,ecx
	vpaddd	xmm6,xmm3,XMMWORD[96+rbp]
	shrd	r13d,r13d,6
	add	eax,r12d
	and	r15d,esi
	xor	r14d,ebx
	add	eax,r13d
	xor	r15d,ecx
	add	r8d,eax
	shrd	r14d,r14d,2
	add	eax,r15d
	mov	r13d,r8d
	add	r14d,eax
	vmovdqa	XMMWORD[48+rsp],xmm6
	mov	r12,QWORD[((64+0))+rsp]
	vpand	xmm11,xmm11,xmm14
	mov	r15,QWORD[((64+8))+rsp]
	vpor	xmm8,xmm8,xmm11
	vmovdqu	XMMWORD[r12*1+r15],xmm8
	lea	r12,[16+r12]
	cmp	BYTE[131+rbp],0
	jne	NEAR $L$avx_00_47
	vmovdqu	xmm9,XMMWORD[r12]
	mov	QWORD[((64+0))+rsp],r12
	shrd	r13d,r13d,14
	mov	eax,r14d
	mov	r12d,r9d
	xor	r13d,r8d
	shrd	r14d,r14d,9
	xor	r12d,r10d
	shrd	r13d,r13d,5
	xor	r14d,eax
	and	r12d,r8d
	vpxor	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((16-128))+rdi]
	xor	r13d,r8d
	add	r11d,DWORD[rsp]
	mov	r15d,eax
	shrd	r14d,r14d,11
	xor	r12d,r10d
	xor	r15d,ebx
	shrd	r13d,r13d,6
	add	r11d,r12d
	and	esi,r15d
	xor	r14d,eax
	add	r11d,r13d
	xor	esi,ebx
	add	edx,r11d
	shrd	r14d,r14d,2
	add	r11d,esi
	mov	r13d,edx
	add	r14d,r11d
	shrd	r13d,r13d,14
	mov	r11d,r14d
	mov	r12d,r8d
	xor	r13d,edx
	shrd	r14d,r14d,9
	xor	r12d,r9d
	shrd	r13d,r13d,5
	xor	r14d,r11d
	and	r12d,edx
	vpxor	xmm9,xmm9,xmm8
	xor	r13d,edx
	add	r10d,DWORD[4+rsp]
	mov	esi,r11d
	shrd	r14d,r14d,11
	xor	r12d,r9d
	xor	esi,eax
	shrd	r13d,r13d,6
	add	r10d,r12d
	and	r15d,esi
	xor	r14d,r11d
	add	r10d,r13d
	xor	r15d,eax
	add	ecx,r10d
	shrd	r14d,r14d,2
	add	r10d,r15d
	mov	r13d,ecx
	add	r14d,r10d
	shrd	r13d,r13d,14
	mov	r10d,r14d
	mov	r12d,edx
	xor	r13d,ecx
	shrd	r14d,r14d,9
	xor	r12d,r8d
	shrd	r13d,r13d,5
	xor	r14d,r10d
	and	r12d,ecx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((32-128))+rdi]
	xor	r13d,ecx
	add	r9d,DWORD[8+rsp]
	mov	r15d,r10d
	shrd	r14d,r14d,11
	xor	r12d,r8d
	xor	r15d,r11d
	shrd	r13d,r13d,6
	add	r9d,r12d
	and	esi,r15d
	xor	r14d,r10d
	add	r9d,r13d
	xor	esi,r11d
	add	ebx,r9d
	shrd	r14d,r14d,2
	add	r9d,esi
	mov	r13d,ebx
	add	r14d,r9d
	shrd	r13d,r13d,14
	mov	r9d,r14d
	mov	r12d,ecx
	xor	r13d,ebx
	shrd	r14d,r14d,9
	xor	r12d,edx
	shrd	r13d,r13d,5
	xor	r14d,r9d
	and	r12d,ebx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((48-128))+rdi]
	xor	r13d,ebx
	add	r8d,DWORD[12+rsp]
	mov	esi,r9d
	shrd	r14d,r14d,11
	xor	r12d,edx
	xor	esi,r10d
	shrd	r13d,r13d,6
	add	r8d,r12d
	and	r15d,esi
	xor	r14d,r9d
	add	r8d,r13d
	xor	r15d,r10d
	add	eax,r8d
	shrd	r14d,r14d,2
	add	r8d,r15d
	mov	r13d,eax
	add	r14d,r8d
	shrd	r13d,r13d,14
	mov	r8d,r14d
	mov	r12d,ebx
	xor	r13d,eax
	shrd	r14d,r14d,9
	xor	r12d,ecx
	shrd	r13d,r13d,5
	xor	r14d,r8d
	and	r12d,eax
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((64-128))+rdi]
	xor	r13d,eax
	add	edx,DWORD[16+rsp]
	mov	r15d,r8d
	shrd	r14d,r14d,11
	xor	r12d,ecx
	xor	r15d,r9d
	shrd	r13d,r13d,6
	add	edx,r12d
	and	esi,r15d
	xor	r14d,r8d
	add	edx,r13d
	xor	esi,r9d
	add	r11d,edx
	shrd	r14d,r14d,2
	add	edx,esi
	mov	r13d,r11d
	add	r14d,edx
	shrd	r13d,r13d,14
	mov	edx,r14d
	mov	r12d,eax
	xor	r13d,r11d
	shrd	r14d,r14d,9
	xor	r12d,ebx
	shrd	r13d,r13d,5
	xor	r14d,edx
	and	r12d,r11d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((80-128))+rdi]
	xor	r13d,r11d
	add	ecx,DWORD[20+rsp]
	mov	esi,edx
	shrd	r14d,r14d,11
	xor	r12d,ebx
	xor	esi,r8d
	shrd	r13d,r13d,6
	add	ecx,r12d
	and	r15d,esi
	xor	r14d,edx
	add	ecx,r13d
	xor	r15d,r8d
	add	r10d,ecx
	shrd	r14d,r14d,2
	add	ecx,r15d
	mov	r13d,r10d
	add	r14d,ecx
	shrd	r13d,r13d,14
	mov	ecx,r14d
	mov	r12d,r11d
	xor	r13d,r10d
	shrd	r14d,r14d,9
	xor	r12d,eax
	shrd	r13d,r13d,5
	xor	r14d,ecx
	and	r12d,r10d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((96-128))+rdi]
	xor	r13d,r10d
	add	ebx,DWORD[24+rsp]
	mov	r15d,ecx
	shrd	r14d,r14d,11
	xor	r12d,eax
	xor	r15d,edx
	shrd	r13d,r13d,6
	add	ebx,r12d
	and	esi,r15d
	xor	r14d,ecx
	add	ebx,r13d
	xor	esi,edx
	add	r9d,ebx
	shrd	r14d,r14d,2
	add	ebx,esi
	mov	r13d,r9d
	add	r14d,ebx
	shrd	r13d,r13d,14
	mov	ebx,r14d
	mov	r12d,r10d
	xor	r13d,r9d
	shrd	r14d,r14d,9
	xor	r12d,r11d
	shrd	r13d,r13d,5
	xor	r14d,ebx
	and	r12d,r9d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((112-128))+rdi]
	xor	r13d,r9d
	add	eax,DWORD[28+rsp]
	mov	esi,ebx
	shrd	r14d,r14d,11
	xor	r12d,r11d
	xor	esi,ecx
	shrd	r13d,r13d,6
	add	eax,r12d
	and	r15d,esi
	xor	r14d,ebx
	add	eax,r13d
	xor	r15d,ecx
	add	r8d,eax
	shrd	r14d,r14d,2
	add	eax,r15d
	mov	r13d,r8d
	add	r14d,eax
	shrd	r13d,r13d,14
	mov	eax,r14d
	mov	r12d,r9d
	xor	r13d,r8d
	shrd	r14d,r14d,9
	xor	r12d,r10d
	shrd	r13d,r13d,5
	xor	r14d,eax
	and	r12d,r8d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((128-128))+rdi]
	xor	r13d,r8d
	add	r11d,DWORD[32+rsp]
	mov	r15d,eax
	shrd	r14d,r14d,11
	xor	r12d,r10d
	xor	r15d,ebx
	shrd	r13d,r13d,6
	add	r11d,r12d
	and	esi,r15d
	xor	r14d,eax
	add	r11d,r13d
	xor	esi,ebx
	add	edx,r11d
	shrd	r14d,r14d,2
	add	r11d,esi
	mov	r13d,edx
	add	r14d,r11d
	shrd	r13d,r13d,14
	mov	r11d,r14d
	mov	r12d,r8d
	xor	r13d,edx
	shrd	r14d,r14d,9
	xor	r12d,r9d
	shrd	r13d,r13d,5
	xor	r14d,r11d
	and	r12d,edx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((144-128))+rdi]
	xor	r13d,edx
	add	r10d,DWORD[36+rsp]
	mov	esi,r11d
	shrd	r14d,r14d,11
	xor	r12d,r9d
	xor	esi,eax
	shrd	r13d,r13d,6
	add	r10d,r12d
	and	r15d,esi
	xor	r14d,r11d
	add	r10d,r13d
	xor	r15d,eax
	add	ecx,r10d
	shrd	r14d,r14d,2
	add	r10d,r15d
	mov	r13d,ecx
	add	r14d,r10d
	shrd	r13d,r13d,14
	mov	r10d,r14d
	mov	r12d,edx
	xor	r13d,ecx
	shrd	r14d,r14d,9
	xor	r12d,r8d
	shrd	r13d,r13d,5
	xor	r14d,r10d
	and	r12d,ecx
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((160-128))+rdi]
	xor	r13d,ecx
	add	r9d,DWORD[40+rsp]
	mov	r15d,r10d
	shrd	r14d,r14d,11
	xor	r12d,r8d
	xor	r15d,r11d
	shrd	r13d,r13d,6
	add	r9d,r12d
	and	esi,r15d
	xor	r14d,r10d
	add	r9d,r13d
	xor	esi,r11d
	add	ebx,r9d
	shrd	r14d,r14d,2
	add	r9d,esi
	mov	r13d,ebx
	add	r14d,r9d
	shrd	r13d,r13d,14
	mov	r9d,r14d
	mov	r12d,ecx
	xor	r13d,ebx
	shrd	r14d,r14d,9
	xor	r12d,edx
	shrd	r13d,r13d,5
	xor	r14d,r9d
	and	r12d,ebx
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((176-128))+rdi]
	xor	r13d,ebx
	add	r8d,DWORD[44+rsp]
	mov	esi,r9d
	shrd	r14d,r14d,11
	xor	r12d,edx
	xor	esi,r10d
	shrd	r13d,r13d,6
	add	r8d,r12d
	and	r15d,esi
	xor	r14d,r9d
	add	r8d,r13d
	xor	r15d,r10d
	add	eax,r8d
	shrd	r14d,r14d,2
	add	r8d,r15d
	mov	r13d,eax
	add	r14d,r8d
	shrd	r13d,r13d,14
	mov	r8d,r14d
	mov	r12d,ebx
	xor	r13d,eax
	shrd	r14d,r14d,9
	xor	r12d,ecx
	shrd	r13d,r13d,5
	xor	r14d,r8d
	and	r12d,eax
	vpand	xmm8,xmm11,xmm12
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((192-128))+rdi]
	xor	r13d,eax
	add	edx,DWORD[48+rsp]
	mov	r15d,r8d
	shrd	r14d,r14d,11
	xor	r12d,ecx
	xor	r15d,r9d
	shrd	r13d,r13d,6
	add	edx,r12d
	and	esi,r15d
	xor	r14d,r8d
	add	edx,r13d
	xor	esi,r9d
	add	r11d,edx
	shrd	r14d,r14d,2
	add	edx,esi
	mov	r13d,r11d
	add	r14d,edx
	shrd	r13d,r13d,14
	mov	edx,r14d
	mov	r12d,eax
	xor	r13d,r11d
	shrd	r14d,r14d,9
	xor	r12d,ebx
	shrd	r13d,r13d,5
	xor	r14d,edx
	and	r12d,r11d
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((208-128))+rdi]
	xor	r13d,r11d
	add	ecx,DWORD[52+rsp]
	mov	esi,edx
	shrd	r14d,r14d,11
	xor	r12d,ebx
	xor	esi,r8d
	shrd	r13d,r13d,6
	add	ecx,r12d
	and	r15d,esi
	xor	r14d,edx
	add	ecx,r13d
	xor	r15d,r8d
	add	r10d,ecx
	shrd	r14d,r14d,2
	add	ecx,r15d
	mov	r13d,r10d
	add	r14d,ecx
	shrd	r13d,r13d,14
	mov	ecx,r14d
	mov	r12d,r11d
	xor	r13d,r10d
	shrd	r14d,r14d,9
	xor	r12d,eax
	shrd	r13d,r13d,5
	xor	r14d,ecx
	and	r12d,r10d
	vpand	xmm11,xmm11,xmm13
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((224-128))+rdi]
	xor	r13d,r10d
	add	ebx,DWORD[56+rsp]
	mov	r15d,ecx
	shrd	r14d,r14d,11
	xor	r12d,eax
	xor	r15d,edx
	shrd	r13d,r13d,6
	add	ebx,r12d
	and	esi,r15d
	xor	r14d,ecx
	add	ebx,r13d
	xor	esi,edx
	add	r9d,ebx
	shrd	r14d,r14d,2
	add	ebx,esi
	mov	r13d,r9d
	add	r14d,ebx
	shrd	r13d,r13d,14
	mov	ebx,r14d
	mov	r12d,r10d
	xor	r13d,r9d
	shrd	r14d,r14d,9
	xor	r12d,r11d
	shrd	r13d,r13d,5
	xor	r14d,ebx
	and	r12d,r9d
	vpor	xmm8,xmm8,xmm11
	vaesenclast	xmm11,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((0-128))+rdi]
	xor	r13d,r9d
	add	eax,DWORD[60+rsp]
	mov	esi,ebx
	shrd	r14d,r14d,11
	xor	r12d,r11d
	xor	esi,ecx
	shrd	r13d,r13d,6
	add	eax,r12d
	and	r15d,esi
	xor	r14d,ebx
	add	eax,r13d
	xor	r15d,ecx
	add	r8d,eax
	shrd	r14d,r14d,2
	add	eax,r15d
	mov	r13d,r8d
	add	r14d,eax
	; ---- Tail of one pass of the AVX path: emit the last AES block, fold the
	; ---- working registers back into the state, then loop or return.
	; The prologue of this routine is outside this excerpt, so the meaning of
	; the frame slots below is inferred from how they are used here.
	; NOTE(review): confirm the slot layout against the routine's prologue.
	mov	r12,QWORD[((64+0))+rsp]
	mov	r13,QWORD[((64+8))+rsp]
	mov	r15,QWORD[((64+40))+rsp]
	mov	rsi,QWORD[((64+48))+rsp]

	; xmm8 |= (xmm11 & xmm14): merge the last masked vaesenclast candidate.
	vpand	xmm11,xmm11,xmm14
	; r14d holds the value the next round step would have moved into eax.
	mov	eax,r14d
	vpor	xmm8,xmm8,xmm11
	; Store the finished 16-byte block at r12+r13, then advance r12 by 16.
	vmovdqu	XMMWORD[r13*1+r12],xmm8
	lea	r12,[16+r12]

	; Add the eight dwords at [r15] to the eight working registers.
	add	eax,DWORD[r15]
	add	ebx,DWORD[4+r15]
	add	ecx,DWORD[8+r15]
	add	edx,DWORD[12+r15]
	add	r8d,DWORD[16+r15]
	add	r9d,DWORD[20+r15]
	add	r10d,DWORD[24+r15]
	add	r11d,DWORD[28+r15]

	; Compare the advanced pointer with the limit kept at [64+16+rsp]. The
	; mov stores below do not modify flags, so the jb still sees this result.
	cmp	r12,QWORD[((64+16))+rsp]

	; Write the updated eight dwords back through r15.
	mov	DWORD[r15],eax
	mov	DWORD[4+r15],ebx
	mov	DWORD[8+r15],ecx
	mov	DWORD[12+r15],edx
	mov	DWORD[16+r15],r8d
	mov	DWORD[20+r15],r9d
	mov	DWORD[24+r15],r10d
	mov	DWORD[28+r15],r11d
	; Unsigned "below": keep looping while r12 has not reached the limit.
	jb	NEAR $L$loop_avx

	; ---- Exit path ----
	; r8  = pointer from [64+32+rsp]; xmm8 (last block produced) is stored there
	;       (presumably the caller's IV buffer -- TODO confirm against prologue).
	; rsi = value from [64+56+rsp], used below as the base of the register
	;       save area.
	mov	r8,QWORD[((64+32))+rsp]
	mov	rsi,QWORD[((64+56))+rsp]
	vmovdqu	XMMWORD[r8],xmm8
	; Clear every vector register before reloading the saved ones.
	vzeroall
	; Reload xmm6-xmm15 (callee-saved under the Microsoft x64 ABI) from the frame.
	movaps	xmm6,XMMWORD[128+rsp]
	movaps	xmm7,XMMWORD[144+rsp]
	movaps	xmm8,XMMWORD[160+rsp]
	movaps	xmm9,XMMWORD[176+rsp]
	movaps	xmm10,XMMWORD[192+rsp]
	movaps	xmm11,XMMWORD[208+rsp]
	movaps	xmm12,XMMWORD[224+rsp]
	movaps	xmm13,XMMWORD[240+rsp]
	movaps	xmm14,XMMWORD[256+rsp]
	movaps	xmm15,XMMWORD[272+rsp]
	; Reload the six general-purpose registers from [rsi+0 .. rsi+40] and drop
	; the 48-byte save area by setting rsp = rsi + 48.
	mov	r15,QWORD[rsi]
	mov	r14,QWORD[8+rsi]
	mov	r13,QWORD[16+rsi]
	mov	r12,QWORD[24+rsi]
	mov	rbp,QWORD[32+rsi]
	mov	rbx,QWORD[40+rsi]
	lea	rsp,[48+rsi]
$L$epilogue_avx:
	; Reload rdi/rsi from the two slots just above the return address.
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	; Return, encoded as the two bytes F3 C3 ("rep ret").
	DB	0F3h,0C3h		;repret
$L$SEH_end_aesni_cbc_sha256_enc_avx:

ALIGN	64
;-----------------------------------------------------------------------
; aesni_cbc_sha256_enc_avx2 -- entry, prologue and one-time setup.
; ABI: Microsoft x64 on entry (args in rcx, rdx, r8, r9, then stack).
; The arguments are shuffled into rdi, rsi, rdx, rcx, r8, r9 plus r10.
; How the visible code uses them:
;   rdi : pointer the AES blocks are read from (copied to r13)
;   rsi : second pointer; only the difference rsi-rdi is kept (in xmm15)
;   rdx : count, shifted left by 6 (x64) and added to rdi -> end pointer
;   rcx : base of the key material; rdi is re-pointed at rcx+128
;   r8  : pointer to a 16-byte value loaded into xmm8
;   r9  : pointer to eight dwords loaded into eax,ebx,ecx,edx,r8d-r11d
;   r10 : seventh (stack) argument; only the difference r10-rdi is kept
; NOTE(review): argument names/semantics beyond this usage are not visible
; here -- confirm against the C prototype.
;-----------------------------------------------------------------------
aesni_cbc_sha256_enc_avx2:
	; Stash rdi/rsi (callee-saved on Win64) in the slots above the return address.
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesni_cbc_sha256_enc_avx2:
	; Remap Win64 argument registers; 5th/6th arguments come from the stack.
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]


$L$avx2_shortcut:
	; r10 = 7th argument (read before the pushes move rsp).
	mov	r10,QWORD[56+rsp]
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	; r11 = rsp right after the six pushes; saved in the frame further down.
	mov	r11,rsp
	; Carve the frame: drop 736 bytes, round rsp down to a 1024-byte boundary,
	; then move it back up by 448.
	sub	rsp,736
	and	rsp,-256*4
	add	rsp,448

	shl	rdx,6
	; Re-bias rsi and r10 so they become offsets relative to rdi.
	sub	rsi,rdi
	sub	r10,rdi
	; rdx = rdi + 64*count: one-past-the-end pointer.
	add	rdx,rdi



	; Frame slots: [64+16]=end pointer, [64+32]=r8, [64+40]=r9,
	; [64+48]=r10 (re-biased), [64+56]=r11 (rsp after the pushes).
	mov	QWORD[((64+16))+rsp],rdx

	mov	QWORD[((64+32))+rsp],r8
	mov	QWORD[((64+40))+rsp],r9
	mov	QWORD[((64+48))+rsp],r10
	mov	QWORD[((64+56))+rsp],r11
	; Save xmm6-xmm15 (callee-saved under the Microsoft x64 ABI).
	movaps	XMMWORD[128+rsp],xmm6
	movaps	XMMWORD[144+rsp],xmm7
	movaps	XMMWORD[160+rsp],xmm8
	movaps	XMMWORD[176+rsp],xmm9
	movaps	XMMWORD[192+rsp],xmm10
	movaps	XMMWORD[208+rsp],xmm11
	movaps	XMMWORD[224+rsp],xmm12
	movaps	XMMWORD[240+rsp],xmm13
	movaps	XMMWORD[256+rsp],xmm14
	movaps	XMMWORD[272+rsp],xmm15
$L$prologue_avx2:
	vzeroall

	; r13 takes over the running pointer; rdi is re-used for the key base.
	mov	r13,rdi
	; High qword of xmm15 = rsi (the re-biased offset used for output stores).
	vpinsrq	xmm15,xmm15,rsi,1
	; rdi = rcx+128, so key-material accesses below are written as (off-128)+rdi.
	lea	rdi,[128+rcx]
	lea	r12,[((K256+544))]
	; r14d = dword at offset 240 from the key base
	; (presumably the AES round count -- TODO confirm).
	mov	r14d,DWORD[((240-128))+rdi]
	mov	r15,r9
	mov	rsi,r10
	; xmm8 = 16 bytes at [r8]; it is xor-ed into each block before encryption.
	vmovdqu	xmm8,XMMWORD[r8]
	lea	r14,[((-9))+r14]

	; Load three 16-byte masks from the table at K256+544, indexed by (r14-9)*8.
	; They gate the three vaesenclast candidates computed in the round code.
	vmovdqa	xmm14,XMMWORD[r14*8+r12]
	vmovdqa	xmm13,XMMWORD[16+r14*8+r12]
	vmovdqa	xmm12,XMMWORD[32+r14*8+r12]

	; r13 += 64 (written as a subtract of -64).
	sub	r13,-16*4
	mov	eax,DWORD[r15]
	; r12 = r13 + rsi: source for the upper 128-bit lanes in the loop head.
	lea	r12,[r13*1+rsi]
	mov	ebx,DWORD[4+r15]
	cmp	r13,rdx
	mov	ecx,DWORD[8+r15]
	; If r13 already equals the end pointer, point r12 at the stack instead,
	; so the upper-lane loads read the local frame rather than past the end.
	cmove	r12,rsp
	mov	edx,DWORD[12+r15]
	mov	r8d,DWORD[16+r15]
	mov	r9d,DWORD[20+r15]
	mov	r10d,DWORD[24+r15]
	mov	r11d,DWORD[28+r15]
	; xmm10 = 16 bytes at offset 0 of the key material.
	vmovdqu	xmm10,XMMWORD[((0-128))+rdi]
	jmp	NEAR $L$oop_avx2
ALIGN	16
; $L$oop_avx2 -- per-pass head: load and shuffle the message words, add the
; first four table rows and stage the sums on the stack for the round code.
;   low  128-bit lanes of ymm0-ymm3 : 64 bytes at [r13+rsi-64]
;   high 128-bit lanes of ymm0-ymm3 : 64 bytes at [r12]
$L$oop_avx2:
	; ymm7 = vpshufb control taken from K256+512
	; (presumably a byte-swap mask -- TODO confirm table contents).
	vmovdqa	ymm7,YMMWORD[((K256+512))]
	vmovdqu	xmm0,XMMWORD[((-64+0))+r13*1+rsi]
	vmovdqu	xmm1,XMMWORD[((-64+16))+r13*1+rsi]
	vmovdqu	xmm2,XMMWORD[((-64+32))+r13*1+rsi]
	vmovdqu	xmm3,XMMWORD[((-64+48))+r13*1+rsi]

	vinserti128	ymm0,ymm0,XMMWORD[r12],1
	vinserti128	ymm1,ymm1,XMMWORD[16+r12],1
	vpshufb	ymm0,ymm0,ymm7
	vinserti128	ymm2,ymm2,XMMWORD[32+r12],1
	vpshufb	ymm1,ymm1,ymm7
	vinserti128	ymm3,ymm3,XMMWORD[48+r12],1

	; rbp = base of the K256 table.
	lea	rbp,[K256]
	vpshufb	ymm2,ymm2,ymm7
	; Undo the +64 bias on r13.
	lea	r13,[((-64))+r13]
	; ymm4-ymm7 = shuffled words + table rows 0..3 (ymm7 is overwritten here,
	; after its last use as the shuffle control).
	vpaddd	ymm4,ymm0,YMMWORD[rbp]
	vpshufb	ymm3,ymm3,ymm7
	vpaddd	ymm5,ymm1,YMMWORD[32+rbp]
	vpaddd	ymm6,ymm2,YMMWORD[64+rbp]
	vpaddd	ymm7,ymm3,YMMWORD[96+rbp]
	; Stage the four 32-byte sums: two at the current rsp, then lower rsp by
	; 64 and store the other two.
	vmovdqa	YMMWORD[rsp],ymm4
	; r14d = 0: nothing is pending for the first "lea eax,[r14+rax]".
	xor	r14d,r14d
	vmovdqa	YMMWORD[32+rsp],ymm5
	lea	rsp,[((-64))+rsp]
	; esi = ebx ^ ecx and r12d = r9d: values the first round step expects.
	mov	esi,ebx
	vmovdqa	YMMWORD[rsp],ymm6
	xor	esi,ecx
	vmovdqa	YMMWORD[32+rsp],ymm7
	mov	r12d,r9d
	; rbp += 128 (written as a subtract of -128): skip the rows just consumed.
	sub	rbp,-16*2*4
	jmp	NEAR $L$avx2_00_47

ALIGN	16
; $L$avx2_00_47 -- main loop body of the AVX2 path. One pass:
;   * loads a 16-byte block from [r13] into xmm9 and parks r13 in the low
;     qword of xmm15 (r13d is reused as scratch by the round steps);
;   * runs 16 scalar round steps (rorx/andn based) over eax,ebx,ecx,edx,
;     r8d-r11d, each consuming one dword staged on the stack (only offsets
;     0..12 and 32..44 of each 64-byte group are read here);
;   * expands ymm0-ymm3 in place, adds the table rows at [rbp]..[96+rbp] and
;     stages the sums in freshly reserved stack (rsp drops by 64 twice);
;   * interleaves AES rounds on xmm9, streaming round keys through xmm10 from
;     (off-128)+rdi; vaesenclast candidates are taken after 10, 12 and 14
;     rounds and merged into xmm8 under the masks xmm12/xmm13/xmm14;
;   * stores xmm8 at [r13+r15], advances r13 by 16 and rbp by 128.
; The loop repeats while the byte at [3+rbp] (after the advance) is non-zero.
$L$avx2_00_47:
	vmovdqu	xmm9,XMMWORD[r13]
	vpinsrq	xmm15,xmm15,r13,0
	; Reserve 64 more bytes of staging space.
	lea	rsp,[((-64))+rsp]
	; ---- round steps 0-3, schedule update of ymm0 ----
	vpalignr	ymm4,ymm1,ymm0,4
	add	r11d,DWORD[((0+128))+rsp]
	and	r12d,r8d
	rorx	r13d,r8d,25
	vpalignr	ymm7,ymm3,ymm2,4
	rorx	r15d,r8d,11
	lea	eax,[r14*1+rax]
	lea	r11d,[r12*1+r11]
	vpsrld	ymm6,ymm4,7
	andn	r12d,r8d,r10d
	xor	r13d,r15d
	rorx	r14d,r8d,6
	vpaddd	ymm0,ymm0,ymm7
	lea	r11d,[r12*1+r11]
	xor	r13d,r14d
	mov	r15d,eax
	vpsrld	ymm7,ymm4,3
	rorx	r12d,eax,22
	lea	r11d,[r13*1+r11]
	xor	r15d,ebx
	vpslld	ymm5,ymm4,14
	rorx	r14d,eax,13
	rorx	r13d,eax,2
	lea	edx,[r11*1+rdx]
	vpxor	ymm4,ymm7,ymm6
	and	esi,r15d
	; AES: xor the block with the key-material entry at offset 0.
	vpxor	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((16-128))+rdi]
	xor	r14d,r12d
	xor	esi,ebx
	vpshufd	ymm7,ymm3,250
	xor	r14d,r13d
	lea	r11d,[rsi*1+r11]
	mov	r12d,r8d
	vpsrld	ymm6,ymm6,11
	add	r10d,DWORD[((4+128))+rsp]
	and	r12d,edx
	rorx	r13d,edx,25
	vpxor	ymm4,ymm4,ymm5
	rorx	esi,edx,11
	lea	r11d,[r14*1+r11]
	lea	r10d,[r12*1+r10]
	vpslld	ymm5,ymm5,11
	andn	r12d,edx,r9d
	xor	r13d,esi
	rorx	r14d,edx,6
	vpxor	ymm4,ymm4,ymm6
	lea	r10d,[r12*1+r10]
	xor	r13d,r14d
	mov	esi,r11d
	vpsrld	ymm6,ymm7,10
	rorx	r12d,r11d,22
	lea	r10d,[r13*1+r10]
	xor	esi,eax
	vpxor	ymm4,ymm4,ymm5
	rorx	r14d,r11d,13
	rorx	r13d,r11d,2
	lea	ecx,[r10*1+rcx]
	vpsrlq	ymm7,ymm7,17
	and	r15d,esi
	; Chaining: xor in xmm8 (the previously produced block, or the initial
	; value loaded from [r8] on the first pass).
	vpxor	xmm9,xmm9,xmm8
	xor	r14d,r12d
	xor	r15d,eax
	vpaddd	ymm0,ymm0,ymm4
	xor	r14d,r13d
	lea	r10d,[r15*1+r10]
	mov	r12d,edx
	vpxor	ymm6,ymm6,ymm7
	add	r9d,DWORD[((8+128))+rsp]
	and	r12d,ecx
	rorx	r13d,ecx,25
	vpsrlq	ymm7,ymm7,2
	rorx	r15d,ecx,11
	lea	r10d,[r14*1+r10]
	lea	r9d,[r12*1+r9]
	vpxor	ymm6,ymm6,ymm7
	andn	r12d,ecx,r8d
	xor	r13d,r15d
	rorx	r14d,ecx,6
	vpshufd	ymm6,ymm6,132
	lea	r9d,[r12*1+r9]
	xor	r13d,r14d
	mov	r15d,r10d
	vpsrldq	ymm6,ymm6,8
	rorx	r12d,r10d,22
	lea	r9d,[r13*1+r9]
	xor	r15d,r11d
	vpaddd	ymm0,ymm0,ymm6
	rorx	r14d,r10d,13
	rorx	r13d,r10d,2
	lea	ebx,[r9*1+rbx]
	vpshufd	ymm7,ymm0,80
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((32-128))+rdi]
	xor	r14d,r12d
	xor	esi,r11d
	vpsrld	ymm6,ymm7,10
	xor	r14d,r13d
	lea	r9d,[rsi*1+r9]
	mov	r12d,ecx
	vpsrlq	ymm7,ymm7,17
	add	r8d,DWORD[((12+128))+rsp]
	and	r12d,ebx
	rorx	r13d,ebx,25
	vpxor	ymm6,ymm6,ymm7
	rorx	esi,ebx,11
	lea	r9d,[r14*1+r9]
	lea	r8d,[r12*1+r8]
	vpsrlq	ymm7,ymm7,2
	andn	r12d,ebx,edx
	xor	r13d,esi
	rorx	r14d,ebx,6
	vpxor	ymm6,ymm6,ymm7
	lea	r8d,[r12*1+r8]
	xor	r13d,r14d
	mov	esi,r9d
	vpshufd	ymm6,ymm6,232
	rorx	r12d,r9d,22
	lea	r8d,[r13*1+r8]
	xor	esi,r10d
	vpslldq	ymm6,ymm6,8
	rorx	r14d,r9d,13
	rorx	r13d,r9d,2
	lea	eax,[r8*1+rax]
	vpaddd	ymm0,ymm0,ymm6
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((48-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r10d
	; Stage updated ymm0 + table row at [rbp].
	vpaddd	ymm6,ymm0,YMMWORD[rbp]
	xor	r14d,r13d
	lea	r8d,[r15*1+r8]
	mov	r12d,ebx
	vmovdqa	YMMWORD[rsp],ymm6
	; ---- round steps 4-7, schedule update of ymm1 ----
	vpalignr	ymm4,ymm2,ymm1,4
	add	edx,DWORD[((32+128))+rsp]
	and	r12d,eax
	rorx	r13d,eax,25
	vpalignr	ymm7,ymm0,ymm3,4
	rorx	r15d,eax,11
	lea	r8d,[r14*1+r8]
	lea	edx,[r12*1+rdx]
	vpsrld	ymm6,ymm4,7
	andn	r12d,eax,ecx
	xor	r13d,r15d
	rorx	r14d,eax,6
	vpaddd	ymm1,ymm1,ymm7
	lea	edx,[r12*1+rdx]
	xor	r13d,r14d
	mov	r15d,r8d
	vpsrld	ymm7,ymm4,3
	rorx	r12d,r8d,22
	lea	edx,[r13*1+rdx]
	xor	r15d,r9d
	vpslld	ymm5,ymm4,14
	rorx	r14d,r8d,13
	rorx	r13d,r8d,2
	lea	r11d,[rdx*1+r11]
	vpxor	ymm4,ymm7,ymm6
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((64-128))+rdi]
	xor	r14d,r12d
	xor	esi,r9d
	vpshufd	ymm7,ymm0,250
	xor	r14d,r13d
	lea	edx,[rsi*1+rdx]
	mov	r12d,eax
	vpsrld	ymm6,ymm6,11
	add	ecx,DWORD[((36+128))+rsp]
	and	r12d,r11d
	rorx	r13d,r11d,25
	vpxor	ymm4,ymm4,ymm5
	rorx	esi,r11d,11
	lea	edx,[r14*1+rdx]
	lea	ecx,[r12*1+rcx]
	vpslld	ymm5,ymm5,11
	andn	r12d,r11d,ebx
	xor	r13d,esi
	rorx	r14d,r11d,6
	vpxor	ymm4,ymm4,ymm6
	lea	ecx,[r12*1+rcx]
	xor	r13d,r14d
	mov	esi,edx
	vpsrld	ymm6,ymm7,10
	rorx	r12d,edx,22
	lea	ecx,[r13*1+rcx]
	xor	esi,r8d
	vpxor	ymm4,ymm4,ymm5
	rorx	r14d,edx,13
	rorx	r13d,edx,2
	lea	r10d,[rcx*1+r10]
	vpsrlq	ymm7,ymm7,17
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((80-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r8d
	vpaddd	ymm1,ymm1,ymm4
	xor	r14d,r13d
	lea	ecx,[r15*1+rcx]
	mov	r12d,r11d
	vpxor	ymm6,ymm6,ymm7
	add	ebx,DWORD[((40+128))+rsp]
	and	r12d,r10d
	rorx	r13d,r10d,25
	vpsrlq	ymm7,ymm7,2
	rorx	r15d,r10d,11
	lea	ecx,[r14*1+rcx]
	lea	ebx,[r12*1+rbx]
	vpxor	ymm6,ymm6,ymm7
	andn	r12d,r10d,eax
	xor	r13d,r15d
	rorx	r14d,r10d,6
	vpshufd	ymm6,ymm6,132
	lea	ebx,[r12*1+rbx]
	xor	r13d,r14d
	mov	r15d,ecx
	vpsrldq	ymm6,ymm6,8
	rorx	r12d,ecx,22
	lea	ebx,[r13*1+rbx]
	xor	r15d,edx
	vpaddd	ymm1,ymm1,ymm6
	rorx	r14d,ecx,13
	rorx	r13d,ecx,2
	lea	r9d,[rbx*1+r9]
	vpshufd	ymm7,ymm1,80
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((96-128))+rdi]
	xor	r14d,r12d
	xor	esi,edx
	vpsrld	ymm6,ymm7,10
	xor	r14d,r13d
	lea	ebx,[rsi*1+rbx]
	mov	r12d,r10d
	vpsrlq	ymm7,ymm7,17
	add	eax,DWORD[((44+128))+rsp]
	and	r12d,r9d
	rorx	r13d,r9d,25
	vpxor	ymm6,ymm6,ymm7
	rorx	esi,r9d,11
	lea	ebx,[r14*1+rbx]
	lea	eax,[r12*1+rax]
	vpsrlq	ymm7,ymm7,2
	andn	r12d,r9d,r11d
	xor	r13d,esi
	rorx	r14d,r9d,6
	vpxor	ymm6,ymm6,ymm7
	lea	eax,[r12*1+rax]
	xor	r13d,r14d
	mov	esi,ebx
	vpshufd	ymm6,ymm6,232
	rorx	r12d,ebx,22
	lea	eax,[r13*1+rax]
	xor	esi,ecx
	vpslldq	ymm6,ymm6,8
	rorx	r14d,ebx,13
	rorx	r13d,ebx,2
	lea	r8d,[rax*1+r8]
	vpaddd	ymm1,ymm1,ymm6
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((112-128))+rdi]
	xor	r14d,r12d
	xor	r15d,ecx
	; Stage updated ymm1 + table row at [32+rbp].
	vpaddd	ymm6,ymm1,YMMWORD[32+rbp]
	xor	r14d,r13d
	lea	eax,[r15*1+rax]
	mov	r12d,r9d
	vmovdqa	YMMWORD[32+rsp],ymm6
	; Reserve the second 64-byte staging group of this pass.
	lea	rsp,[((-64))+rsp]
	; ---- round steps 8-11, schedule update of ymm2 ----
	vpalignr	ymm4,ymm3,ymm2,4
	add	r11d,DWORD[((0+128))+rsp]
	and	r12d,r8d
	rorx	r13d,r8d,25
	vpalignr	ymm7,ymm1,ymm0,4
	rorx	r15d,r8d,11
	lea	eax,[r14*1+rax]
	lea	r11d,[r12*1+r11]
	vpsrld	ymm6,ymm4,7
	andn	r12d,r8d,r10d
	xor	r13d,r15d
	rorx	r14d,r8d,6
	vpaddd	ymm2,ymm2,ymm7
	lea	r11d,[r12*1+r11]
	xor	r13d,r14d
	mov	r15d,eax
	vpsrld	ymm7,ymm4,3
	rorx	r12d,eax,22
	lea	r11d,[r13*1+r11]
	xor	r15d,ebx
	vpslld	ymm5,ymm4,14
	rorx	r14d,eax,13
	rorx	r13d,eax,2
	lea	edx,[r11*1+rdx]
	vpxor	ymm4,ymm7,ymm6
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((128-128))+rdi]
	xor	r14d,r12d
	xor	esi,ebx
	vpshufd	ymm7,ymm1,250
	xor	r14d,r13d
	lea	r11d,[rsi*1+r11]
	mov	r12d,r8d
	vpsrld	ymm6,ymm6,11
	add	r10d,DWORD[((4+128))+rsp]
	and	r12d,edx
	rorx	r13d,edx,25
	vpxor	ymm4,ymm4,ymm5
	rorx	esi,edx,11
	lea	r11d,[r14*1+r11]
	lea	r10d,[r12*1+r10]
	vpslld	ymm5,ymm5,11
	andn	r12d,edx,r9d
	xor	r13d,esi
	rorx	r14d,edx,6
	vpxor	ymm4,ymm4,ymm6
	lea	r10d,[r12*1+r10]
	xor	r13d,r14d
	mov	esi,r11d
	vpsrld	ymm6,ymm7,10
	rorx	r12d,r11d,22
	lea	r10d,[r13*1+r10]
	xor	esi,eax
	vpxor	ymm4,ymm4,ymm5
	rorx	r14d,r11d,13
	rorx	r13d,r11d,2
	lea	ecx,[r10*1+rcx]
	vpsrlq	ymm7,ymm7,17
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((144-128))+rdi]
	xor	r14d,r12d
	xor	r15d,eax
	vpaddd	ymm2,ymm2,ymm4
	xor	r14d,r13d
	lea	r10d,[r15*1+r10]
	mov	r12d,edx
	vpxor	ymm6,ymm6,ymm7
	add	r9d,DWORD[((8+128))+rsp]
	and	r12d,ecx
	rorx	r13d,ecx,25
	vpsrlq	ymm7,ymm7,2
	rorx	r15d,ecx,11
	lea	r10d,[r14*1+r10]
	lea	r9d,[r12*1+r9]
	vpxor	ymm6,ymm6,ymm7
	andn	r12d,ecx,r8d
	xor	r13d,r15d
	rorx	r14d,ecx,6
	vpshufd	ymm6,ymm6,132
	lea	r9d,[r12*1+r9]
	xor	r13d,r14d
	mov	r15d,r10d
	vpsrldq	ymm6,ymm6,8
	rorx	r12d,r10d,22
	lea	r9d,[r13*1+r9]
	xor	r15d,r11d
	vpaddd	ymm2,ymm2,ymm6
	rorx	r14d,r10d,13
	rorx	r13d,r10d,2
	lea	ebx,[r9*1+rbx]
	vpshufd	ymm7,ymm2,80
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((160-128))+rdi]
	xor	r14d,r12d
	xor	esi,r11d
	vpsrld	ymm6,ymm7,10
	xor	r14d,r13d
	lea	r9d,[rsi*1+r9]
	mov	r12d,ecx
	vpsrlq	ymm7,ymm7,17
	add	r8d,DWORD[((12+128))+rsp]
	and	r12d,ebx
	rorx	r13d,ebx,25
	vpxor	ymm6,ymm6,ymm7
	rorx	esi,ebx,11
	lea	r9d,[r14*1+r9]
	lea	r8d,[r12*1+r8]
	vpsrlq	ymm7,ymm7,2
	andn	r12d,ebx,edx
	xor	r13d,esi
	rorx	r14d,ebx,6
	vpxor	ymm6,ymm6,ymm7
	lea	r8d,[r12*1+r8]
	xor	r13d,r14d
	mov	esi,r9d
	vpshufd	ymm6,ymm6,232
	rorx	r12d,r9d,22
	lea	r8d,[r13*1+r8]
	xor	esi,r10d
	vpslldq	ymm6,ymm6,8
	rorx	r14d,r9d,13
	rorx	r13d,r9d,2
	lea	eax,[r8*1+rax]
	vpaddd	ymm2,ymm2,ymm6
	and	r15d,esi
	; Candidate final round after 10 rounds (key at offset 160) goes to xmm11;
	; xmm9 itself continues with a normal round using the same key.
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((176-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r10d
	; Stage updated ymm2 + table row at [64+rbp].
	vpaddd	ymm6,ymm2,YMMWORD[64+rbp]
	xor	r14d,r13d
	lea	r8d,[r15*1+r8]
	mov	r12d,ebx
	vmovdqa	YMMWORD[rsp],ymm6
	; ---- round steps 12-15, schedule update of ymm3 ----
	vpalignr	ymm4,ymm0,ymm3,4
	add	edx,DWORD[((32+128))+rsp]
	and	r12d,eax
	rorx	r13d,eax,25
	vpalignr	ymm7,ymm2,ymm1,4
	rorx	r15d,eax,11
	lea	r8d,[r14*1+r8]
	lea	edx,[r12*1+rdx]
	vpsrld	ymm6,ymm4,7
	andn	r12d,eax,ecx
	xor	r13d,r15d
	rorx	r14d,eax,6
	vpaddd	ymm3,ymm3,ymm7
	lea	edx,[r12*1+rdx]
	xor	r13d,r14d
	mov	r15d,r8d
	vpsrld	ymm7,ymm4,3
	rorx	r12d,r8d,22
	lea	edx,[r13*1+rdx]
	xor	r15d,r9d
	vpslld	ymm5,ymm4,14
	rorx	r14d,r8d,13
	rorx	r13d,r8d,2
	lea	r11d,[rdx*1+r11]
	vpxor	ymm4,ymm7,ymm6
	and	esi,r15d
	; xmm8 = 10-round candidate masked by xmm12.
	vpand	xmm8,xmm11,xmm12
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((192-128))+rdi]
	xor	r14d,r12d
	xor	esi,r9d
	vpshufd	ymm7,ymm2,250
	xor	r14d,r13d
	lea	edx,[rsi*1+rdx]
	mov	r12d,eax
	vpsrld	ymm6,ymm6,11
	add	ecx,DWORD[((36+128))+rsp]
	and	r12d,r11d
	rorx	r13d,r11d,25
	vpxor	ymm4,ymm4,ymm5
	rorx	esi,r11d,11
	lea	edx,[r14*1+rdx]
	lea	ecx,[r12*1+rcx]
	vpslld	ymm5,ymm5,11
	andn	r12d,r11d,ebx
	xor	r13d,esi
	rorx	r14d,r11d,6
	vpxor	ymm4,ymm4,ymm6
	lea	ecx,[r12*1+rcx]
	xor	r13d,r14d
	mov	esi,edx
	vpsrld	ymm6,ymm7,10
	rorx	r12d,edx,22
	lea	ecx,[r13*1+rcx]
	xor	esi,r8d
	vpxor	ymm4,ymm4,ymm5
	rorx	r14d,edx,13
	rorx	r13d,edx,2
	lea	r10d,[rcx*1+r10]
	vpsrlq	ymm7,ymm7,17
	and	r15d,esi
	; Candidate final round after 12 rounds (key at offset 192).
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((208-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r8d
	vpaddd	ymm3,ymm3,ymm4
	xor	r14d,r13d
	lea	ecx,[r15*1+rcx]
	mov	r12d,r11d
	vpxor	ymm6,ymm6,ymm7
	add	ebx,DWORD[((40+128))+rsp]
	and	r12d,r10d
	rorx	r13d,r10d,25
	vpsrlq	ymm7,ymm7,2
	rorx	r15d,r10d,11
	lea	ecx,[r14*1+rcx]
	lea	ebx,[r12*1+rbx]
	vpxor	ymm6,ymm6,ymm7
	andn	r12d,r10d,eax
	xor	r13d,r15d
	rorx	r14d,r10d,6
	vpshufd	ymm6,ymm6,132
	lea	ebx,[r12*1+rbx]
	xor	r13d,r14d
	mov	r15d,ecx
	vpsrldq	ymm6,ymm6,8
	rorx	r12d,ecx,22
	lea	ebx,[r13*1+rbx]
	xor	r15d,edx
	vpaddd	ymm3,ymm3,ymm6
	rorx	r14d,ecx,13
	rorx	r13d,ecx,2
	lea	r9d,[rbx*1+r9]
	vpshufd	ymm7,ymm3,80
	and	esi,r15d
	; 12-round candidate masked by xmm13.
	vpand	xmm11,xmm11,xmm13
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((224-128))+rdi]
	xor	r14d,r12d
	xor	esi,edx
	vpsrld	ymm6,ymm7,10
	xor	r14d,r13d
	lea	ebx,[rsi*1+rbx]
	mov	r12d,r10d
	vpsrlq	ymm7,ymm7,17
	add	eax,DWORD[((44+128))+rsp]
	and	r12d,r9d
	rorx	r13d,r9d,25
	vpxor	ymm6,ymm6,ymm7
	rorx	esi,r9d,11
	lea	ebx,[r14*1+rbx]
	lea	eax,[r12*1+rax]
	vpsrlq	ymm7,ymm7,2
	andn	r12d,r9d,r11d
	xor	r13d,esi
	rorx	r14d,r9d,6
	vpxor	ymm6,ymm6,ymm7
	lea	eax,[r12*1+rax]
	xor	r13d,r14d
	mov	esi,ebx
	vpshufd	ymm6,ymm6,232
	rorx	r12d,ebx,22
	lea	eax,[r13*1+rax]
	xor	esi,ecx
	vpslldq	ymm6,ymm6,8
	rorx	r14d,ebx,13
	rorx	r13d,ebx,2
	lea	r8d,[rax*1+r8]
	vpaddd	ymm3,ymm3,ymm6
	and	r15d,esi
	; Merge the 12-round candidate, then take the 14-round candidate
	; (key at offset 224) and reload the offset-0 key for the next block.
	vpor	xmm8,xmm8,xmm11
	vaesenclast	xmm11,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((0-128))+rdi]
	xor	r14d,r12d
	xor	r15d,ecx
	; Stage updated ymm3 + table row at [96+rbp].
	vpaddd	ymm6,ymm3,YMMWORD[96+rbp]
	xor	r14d,r13d
	lea	eax,[r15*1+rax]
	mov	r12d,r9d
	vmovdqa	YMMWORD[32+rsp],ymm6
	; Recover the pointer (low qword) and output offset (high qword) from xmm15.
	vmovq	r13,xmm15
	vpextrq	r15,xmm15,1
	; Merge the 14-round candidate under mask xmm14 and store the finished block.
	vpand	xmm11,xmm11,xmm14
	vpor	xmm8,xmm8,xmm11
	vmovdqu	XMMWORD[r13*1+r15],xmm8
	lea	r13,[16+r13]
	; Advance the table pointer by 128 bytes; loop while the byte at [3+rbp]
	; is non-zero.
	lea	rbp,[128+rbp]
	cmp	BYTE[3+rbp],0
	jne	NEAR $L$avx2_00_47
	vmovdqu	xmm9,XMMWORD[r13]
	vpinsrq	xmm15,xmm15,r13,0
	add	r11d,DWORD[((0+64))+rsp]
	and	r12d,r8d
	rorx	r13d,r8d,25
	rorx	r15d,r8d,11
	lea	eax,[r14*1+rax]
	lea	r11d,[r12*1+r11]
	andn	r12d,r8d,r10d
	xor	r13d,r15d
	rorx	r14d,r8d,6
	lea	r11d,[r12*1+r11]
	xor	r13d,r14d
	mov	r15d,eax
	rorx	r12d,eax,22
	lea	r11d,[r13*1+r11]
	xor	r15d,ebx
	rorx	r14d,eax,13
	rorx	r13d,eax,2
	lea	edx,[r11*1+rdx]
	and	esi,r15d
	vpxor	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((16-128))+rdi]
	xor	r14d,r12d
	xor	esi,ebx
	xor	r14d,r13d
	lea	r11d,[rsi*1+r11]
	mov	r12d,r8d
	add	r10d,DWORD[((4+64))+rsp]
	and	r12d,edx
	rorx	r13d,edx,25
	rorx	esi,edx,11
	lea	r11d,[r14*1+r11]
	lea	r10d,[r12*1+r10]
	andn	r12d,edx,r9d
	xor	r13d,esi
	rorx	r14d,edx,6
	lea	r10d,[r12*1+r10]
	xor	r13d,r14d
	mov	esi,r11d
	rorx	r12d,r11d,22
	lea	r10d,[r13*1+r10]
	xor	esi,eax
	rorx	r14d,r11d,13
	rorx	r13d,r11d,2
	lea	ecx,[r10*1+rcx]
	and	r15d,esi
	vpxor	xmm9,xmm9,xmm8
	xor	r14d,r12d
	xor	r15d,eax
	xor	r14d,r13d
	lea	r10d,[r15*1+r10]
	mov	r12d,edx
	add	r9d,DWORD[((8+64))+rsp]
	and	r12d,ecx
	rorx	r13d,ecx,25
	rorx	r15d,ecx,11
	lea	r10d,[r14*1+r10]
	lea	r9d,[r12*1+r9]
	andn	r12d,ecx,r8d
	xor	r13d,r15d
	rorx	r14d,ecx,6
	lea	r9d,[r12*1+r9]
	xor	r13d,r14d
	mov	r15d,r10d
	rorx	r12d,r10d,22
	lea	r9d,[r13*1+r9]
	xor	r15d,r11d
	rorx	r14d,r10d,13
	rorx	r13d,r10d,2
	lea	ebx,[r9*1+rbx]
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((32-128))+rdi]
	xor	r14d,r12d
	xor	esi,r11d
	xor	r14d,r13d
	lea	r9d,[rsi*1+r9]
	mov	r12d,ecx
	add	r8d,DWORD[((12+64))+rsp]
	and	r12d,ebx
	rorx	r13d,ebx,25
	rorx	esi,ebx,11
	lea	r9d,[r14*1+r9]
	lea	r8d,[r12*1+r8]
	andn	r12d,ebx,edx
	xor	r13d,esi
	rorx	r14d,ebx,6
	lea	r8d,[r12*1+r8]
	xor	r13d,r14d
	mov	esi,r9d
	rorx	r12d,r9d,22
	lea	r8d,[r13*1+r8]
	xor	esi,r10d
	rorx	r14d,r9d,13
	rorx	r13d,r9d,2
	lea	eax,[r8*1+rax]
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((48-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r10d
	xor	r14d,r13d
	lea	r8d,[r15*1+r8]
	mov	r12d,ebx
	add	edx,DWORD[((32+64))+rsp]
	and	r12d,eax
	rorx	r13d,eax,25
	rorx	r15d,eax,11
	lea	r8d,[r14*1+r8]
	lea	edx,[r12*1+rdx]
	andn	r12d,eax,ecx
	xor	r13d,r15d
	rorx	r14d,eax,6
	lea	edx,[r12*1+rdx]
	xor	r13d,r14d
	mov	r15d,r8d
	rorx	r12d,r8d,22
	lea	edx,[r13*1+rdx]
	xor	r15d,r9d
	rorx	r14d,r8d,13
	rorx	r13d,r8d,2
	lea	r11d,[rdx*1+r11]
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((64-128))+rdi]
	xor	r14d,r12d
	xor	esi,r9d
	xor	r14d,r13d
	lea	edx,[rsi*1+rdx]
	mov	r12d,eax
	add	ecx,DWORD[((36+64))+rsp]
	and	r12d,r11d
	rorx	r13d,r11d,25
	rorx	esi,r11d,11
	lea	edx,[r14*1+rdx]
	lea	ecx,[r12*1+rcx]
	andn	r12d,r11d,ebx
	xor	r13d,esi
	rorx	r14d,r11d,6
	lea	ecx,[r12*1+rcx]
	xor	r13d,r14d
	mov	esi,edx
	rorx	r12d,edx,22
	lea	ecx,[r13*1+rcx]
	xor	esi,r8d
	rorx	r14d,edx,13
	rorx	r13d,edx,2
	lea	r10d,[rcx*1+r10]
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((80-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r8d
	xor	r14d,r13d
	lea	ecx,[r15*1+rcx]
	mov	r12d,r11d
	add	ebx,DWORD[((40+64))+rsp]
	and	r12d,r10d
	rorx	r13d,r10d,25
	rorx	r15d,r10d,11
	lea	ecx,[r14*1+rcx]
	lea	ebx,[r12*1+rbx]
	andn	r12d,r10d,eax
	xor	r13d,r15d
	rorx	r14d,r10d,6
	lea	ebx,[r12*1+rbx]
	xor	r13d,r14d
	mov	r15d,ecx
	rorx	r12d,ecx,22
	lea	ebx,[r13*1+rbx]
	xor	r15d,edx
	rorx	r14d,ecx,13
	rorx	r13d,ecx,2
	lea	r9d,[rbx*1+r9]
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((96-128))+rdi]
	xor	r14d,r12d
	xor	esi,edx
	xor	r14d,r13d
	lea	ebx,[rsi*1+rbx]
	mov	r12d,r10d
	add	eax,DWORD[((44+64))+rsp]
	and	r12d,r9d
	rorx	r13d,r9d,25
	rorx	esi,r9d,11
	lea	ebx,[r14*1+rbx]
	lea	eax,[r12*1+rax]
	andn	r12d,r9d,r11d
	xor	r13d,esi
	rorx	r14d,r9d,6
	lea	eax,[r12*1+rax]
	xor	r13d,r14d
	mov	esi,ebx
	rorx	r12d,ebx,22
	lea	eax,[r13*1+rax]
	xor	esi,ecx
	rorx	r14d,ebx,13
	rorx	r13d,ebx,2
	lea	r8d,[rax*1+r8]
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((112-128))+rdi]
	xor	r14d,r12d
	xor	r15d,ecx
	xor	r14d,r13d
	lea	eax,[r15*1+rax]
	mov	r12d,r9d
	add	r11d,DWORD[rsp]
	and	r12d,r8d
	rorx	r13d,r8d,25
	rorx	r15d,r8d,11
	lea	eax,[r14*1+rax]
	lea	r11d,[r12*1+r11]
	andn	r12d,r8d,r10d
	xor	r13d,r15d
	rorx	r14d,r8d,6
	lea	r11d,[r12*1+r11]
	xor	r13d,r14d
	mov	r15d,eax
	rorx	r12d,eax,22
	lea	r11d,[r13*1+r11]
	xor	r15d,ebx
	rorx	r14d,eax,13
	rorx	r13d,eax,2
	lea	edx,[r11*1+rdx]
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((128-128))+rdi]
	xor	r14d,r12d
	xor	esi,ebx
	xor	r14d,r13d
	lea	r11d,[rsi*1+r11]
	mov	r12d,r8d
	add	r10d,DWORD[4+rsp]
	and	r12d,edx
	rorx	r13d,edx,25
	rorx	esi,edx,11
	lea	r11d,[r14*1+r11]
	lea	r10d,[r12*1+r10]
	andn	r12d,edx,r9d
	xor	r13d,esi
	rorx	r14d,edx,6
	lea	r10d,[r12*1+r10]
	xor	r13d,r14d
	mov	esi,r11d
	rorx	r12d,r11d,22
	lea	r10d,[r13*1+r10]
	xor	esi,eax
	rorx	r14d,r11d,13
	rorx	r13d,r11d,2
	lea	ecx,[r10*1+rcx]
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((144-128))+rdi]
	xor	r14d,r12d
	xor	r15d,eax
	xor	r14d,r13d
	lea	r10d,[r15*1+r10]
	mov	r12d,edx
	add	r9d,DWORD[8+rsp]
	and	r12d,ecx
	rorx	r13d,ecx,25
	rorx	r15d,ecx,11
	lea	r10d,[r14*1+r10]
	lea	r9d,[r12*1+r9]
	andn	r12d,ecx,r8d
	xor	r13d,r15d
	rorx	r14d,ecx,6
	lea	r9d,[r12*1+r9]
	xor	r13d,r14d
	mov	r15d,r10d
	rorx	r12d,r10d,22
	lea	r9d,[r13*1+r9]
	xor	r15d,r11d
	rorx	r14d,r10d,13
	rorx	r13d,r10d,2
	lea	ebx,[r9*1+rbx]
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((160-128))+rdi]
	xor	r14d,r12d
	xor	esi,r11d
	xor	r14d,r13d
	lea	r9d,[rsi*1+r9]
	mov	r12d,ecx
	add	r8d,DWORD[12+rsp]
	and	r12d,ebx
	rorx	r13d,ebx,25
	rorx	esi,ebx,11
	lea	r9d,[r14*1+r9]
	lea	r8d,[r12*1+r8]
	andn	r12d,ebx,edx
	xor	r13d,esi
	rorx	r14d,ebx,6
	lea	r8d,[r12*1+r8]
	xor	r13d,r14d
	mov	esi,r9d
	rorx	r12d,r9d,22
	lea	r8d,[r13*1+r8]
	xor	esi,r10d
	rorx	r14d,r9d,13
	rorx	r13d,r9d,2
	lea	eax,[r8*1+rax]
	and	r15d,esi
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((176-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r10d
	xor	r14d,r13d
	lea	r8d,[r15*1+r8]
	mov	r12d,ebx
	add	edx,DWORD[32+rsp]
	and	r12d,eax
	rorx	r13d,eax,25
	rorx	r15d,eax,11
	lea	r8d,[r14*1+r8]
	lea	edx,[r12*1+rdx]
	andn	r12d,eax,ecx
	xor	r13d,r15d
	rorx	r14d,eax,6
	lea	edx,[r12*1+rdx]
	xor	r13d,r14d
	mov	r15d,r8d
	rorx	r12d,r8d,22
	lea	edx,[r13*1+rdx]
	xor	r15d,r9d
	rorx	r14d,r8d,13
	rorx	r13d,r8d,2
	lea	r11d,[rdx*1+r11]
	and	esi,r15d
	vpand	xmm8,xmm11,xmm12
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((192-128))+rdi]
	xor	r14d,r12d
	xor	esi,r9d
	xor	r14d,r13d
	lea	edx,[rsi*1+rdx]
	mov	r12d,eax
	add	ecx,DWORD[36+rsp]
	and	r12d,r11d
	rorx	r13d,r11d,25
	rorx	esi,r11d,11
	lea	edx,[r14*1+rdx]
	lea	ecx,[r12*1+rcx]
	andn	r12d,r11d,ebx
	xor	r13d,esi
	rorx	r14d,r11d,6
	lea	ecx,[r12*1+rcx]
	xor	r13d,r14d
	mov	esi,edx
	rorx	r12d,edx,22
	lea	ecx,[r13*1+rcx]
	xor	esi,r8d
	rorx	r14d,edx,13
	rorx	r13d,edx,2
	lea	r10d,[rcx*1+r10]
	and	r15d,esi
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((208-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r8d
	xor	r14d,r13d
	lea	ecx,[r15*1+rcx]
	mov	r12d,r11d
	add	ebx,DWORD[40+rsp]
	and	r12d,r10d
	rorx	r13d,r10d,25
	rorx	r15d,r10d,11
	lea	ecx,[r14*1+rcx]
	lea	ebx,[r12*1+rbx]
	andn	r12d,r10d,eax
	xor	r13d,r15d
	rorx	r14d,r10d,6
	lea	ebx,[r12*1+rbx]
	xor	r13d,r14d
	mov	r15d,ecx
	rorx	r12d,ecx,22
	lea	ebx,[r13*1+rbx]
	xor	r15d,edx
	rorx	r14d,ecx,13
	rorx	r13d,ecx,2
	lea	r9d,[rbx*1+r9]
	and	esi,r15d
	vpand	xmm11,xmm11,xmm13
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((224-128))+rdi]
	xor	r14d,r12d
	xor	esi,edx
	xor	r14d,r13d
	lea	ebx,[rsi*1+rbx]
	mov	r12d,r10d
	add	eax,DWORD[44+rsp]
	and	r12d,r9d
	rorx	r13d,r9d,25
	rorx	esi,r9d,11
	lea	ebx,[r14*1+rbx]
	lea	eax,[r12*1+rax]
	andn	r12d,r9d,r11d
	xor	r13d,esi
	rorx	r14d,r9d,6
	lea	eax,[r12*1+rax]
	xor	r13d,r14d
	mov	esi,ebx
	rorx	r12d,ebx,22
	lea	eax,[r13*1+rax]
	xor	esi,ecx
	rorx	r14d,ebx,13
	rorx	r13d,ebx,2
	lea	r8d,[rax*1+r8]
	and	r15d,esi
	vpor	xmm8,xmm8,xmm11
	vaesenclast	xmm11,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((0-128))+rdi]
	xor	r14d,r12d
	xor	r15d,ecx
	xor	r14d,r13d
	lea	eax,[r15*1+rax]
	mov	r12d,r9d
	vpextrq	r12,xmm15,1
	vmovq	r13,xmm15
	mov	r15,QWORD[552+rsp]
	add	eax,r14d
	lea	rbp,[448+rsp]

	vpand	xmm11,xmm11,xmm14
	vpor	xmm8,xmm8,xmm11
	vmovdqu	XMMWORD[r13*1+r12],xmm8
	lea	r13,[16+r13]

	add	eax,DWORD[r15]
	add	ebx,DWORD[4+r15]
	add	ecx,DWORD[8+r15]
	add	edx,DWORD[12+r15]
	add	r8d,DWORD[16+r15]
	add	r9d,DWORD[20+r15]
	add	r10d,DWORD[24+r15]
	add	r11d,DWORD[28+r15]

	mov	DWORD[r15],eax
	mov	DWORD[4+r15],ebx
	mov	DWORD[8+r15],ecx
	mov	DWORD[12+r15],edx
	mov	DWORD[16+r15],r8d
	mov	DWORD[20+r15],r9d
	mov	DWORD[24+r15],r10d
	mov	DWORD[28+r15],r11d

	cmp	r13,QWORD[80+rbp]
	je	NEAR $L$done_avx2

	xor	r14d,r14d
	mov	esi,ebx
	mov	r12d,r9d
	xor	esi,ecx
	jmp	NEAR $L$ower_avx2
ALIGN	16
$L$ower_avx2:
	vmovdqu	xmm9,XMMWORD[r13]
	vpinsrq	xmm15,xmm15,r13,0
	add	r11d,DWORD[((0+16))+rbp]
	and	r12d,r8d
	rorx	r13d,r8d,25
	rorx	r15d,r8d,11
	lea	eax,[r14*1+rax]
	lea	r11d,[r12*1+r11]
	andn	r12d,r8d,r10d
	xor	r13d,r15d
	rorx	r14d,r8d,6
	lea	r11d,[r12*1+r11]
	xor	r13d,r14d
	mov	r15d,eax
	rorx	r12d,eax,22
	lea	r11d,[r13*1+r11]
	xor	r15d,ebx
	rorx	r14d,eax,13
	rorx	r13d,eax,2
	lea	edx,[r11*1+rdx]
	and	esi,r15d
	vpxor	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((16-128))+rdi]
	xor	r14d,r12d
	xor	esi,ebx
	xor	r14d,r13d
	lea	r11d,[rsi*1+r11]
	mov	r12d,r8d
	add	r10d,DWORD[((4+16))+rbp]
	and	r12d,edx
	rorx	r13d,edx,25
	rorx	esi,edx,11
	lea	r11d,[r14*1+r11]
	lea	r10d,[r12*1+r10]
	andn	r12d,edx,r9d
	xor	r13d,esi
	rorx	r14d,edx,6
	lea	r10d,[r12*1+r10]
	xor	r13d,r14d
	mov	esi,r11d
	rorx	r12d,r11d,22
	lea	r10d,[r13*1+r10]
	xor	esi,eax
	rorx	r14d,r11d,13
	rorx	r13d,r11d,2
	lea	ecx,[r10*1+rcx]
	and	r15d,esi
	vpxor	xmm9,xmm9,xmm8
	xor	r14d,r12d
	xor	r15d,eax
	xor	r14d,r13d
	lea	r10d,[r15*1+r10]
	mov	r12d,edx
	add	r9d,DWORD[((8+16))+rbp]
	and	r12d,ecx
	rorx	r13d,ecx,25
	rorx	r15d,ecx,11
	lea	r10d,[r14*1+r10]
	lea	r9d,[r12*1+r9]
	andn	r12d,ecx,r8d
	xor	r13d,r15d
	rorx	r14d,ecx,6
	lea	r9d,[r12*1+r9]
	xor	r13d,r14d
	mov	r15d,r10d
	rorx	r12d,r10d,22
	lea	r9d,[r13*1+r9]
	xor	r15d,r11d
	rorx	r14d,r10d,13
	rorx	r13d,r10d,2
	lea	ebx,[r9*1+rbx]
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((32-128))+rdi]
	xor	r14d,r12d
	xor	esi,r11d
	xor	r14d,r13d
	lea	r9d,[rsi*1+r9]
	mov	r12d,ecx
	add	r8d,DWORD[((12+16))+rbp]
	and	r12d,ebx
	rorx	r13d,ebx,25
	rorx	esi,ebx,11
	lea	r9d,[r14*1+r9]
	lea	r8d,[r12*1+r8]
	andn	r12d,ebx,edx
	xor	r13d,esi
	rorx	r14d,ebx,6
	lea	r8d,[r12*1+r8]
	xor	r13d,r14d
	mov	esi,r9d
	rorx	r12d,r9d,22
	lea	r8d,[r13*1+r8]
	xor	esi,r10d
	rorx	r14d,r9d,13
	rorx	r13d,r9d,2
	lea	eax,[r8*1+rax]
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((48-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r10d
	xor	r14d,r13d
	lea	r8d,[r15*1+r8]
	mov	r12d,ebx
	add	edx,DWORD[((32+16))+rbp]
	and	r12d,eax
	rorx	r13d,eax,25
	rorx	r15d,eax,11
	lea	r8d,[r14*1+r8]
	lea	edx,[r12*1+rdx]
	andn	r12d,eax,ecx
	xor	r13d,r15d
	rorx	r14d,eax,6
	lea	edx,[r12*1+rdx]
	xor	r13d,r14d
	mov	r15d,r8d
	rorx	r12d,r8d,22
	lea	edx,[r13*1+rdx]
	xor	r15d,r9d
	rorx	r14d,r8d,13
	rorx	r13d,r8d,2
	lea	r11d,[rdx*1+r11]
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((64-128))+rdi]
	xor	r14d,r12d
	xor	esi,r9d
	xor	r14d,r13d
	lea	edx,[rsi*1+rdx]
	mov	r12d,eax
	add	ecx,DWORD[((36+16))+rbp]
	and	r12d,r11d
	rorx	r13d,r11d,25
	rorx	esi,r11d,11
	lea	edx,[r14*1+rdx]
	lea	ecx,[r12*1+rcx]
	andn	r12d,r11d,ebx
	xor	r13d,esi
	rorx	r14d,r11d,6
	lea	ecx,[r12*1+rcx]
	xor	r13d,r14d
	mov	esi,edx
	rorx	r12d,edx,22
	lea	ecx,[r13*1+rcx]
	xor	esi,r8d
	rorx	r14d,edx,13
	rorx	r13d,edx,2
	lea	r10d,[rcx*1+r10]
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((80-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r8d
	xor	r14d,r13d
	lea	ecx,[r15*1+rcx]
	mov	r12d,r11d
	add	ebx,DWORD[((40+16))+rbp]
	and	r12d,r10d
	rorx	r13d,r10d,25
	rorx	r15d,r10d,11
	lea	ecx,[r14*1+rcx]
	lea	ebx,[r12*1+rbx]
	andn	r12d,r10d,eax
	xor	r13d,r15d
	rorx	r14d,r10d,6
	lea	ebx,[r12*1+rbx]
	xor	r13d,r14d
	mov	r15d,ecx
	rorx	r12d,ecx,22
	lea	ebx,[r13*1+rbx]
	xor	r15d,edx
	rorx	r14d,ecx,13
	rorx	r13d,ecx,2
	lea	r9d,[rbx*1+r9]
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((96-128))+rdi]
	xor	r14d,r12d
	xor	esi,edx
	xor	r14d,r13d
	lea	ebx,[rsi*1+rbx]
	mov	r12d,r10d
	add	eax,DWORD[((44+16))+rbp]
	and	r12d,r9d
	rorx	r13d,r9d,25
	rorx	esi,r9d,11
	lea	ebx,[r14*1+rbx]
	lea	eax,[r12*1+rax]
	andn	r12d,r9d,r11d
	xor	r13d,esi
	rorx	r14d,r9d,6
	lea	eax,[r12*1+rax]
	xor	r13d,r14d
	mov	esi,ebx
	rorx	r12d,ebx,22
	lea	eax,[r13*1+rax]
	xor	esi,ecx
	rorx	r14d,ebx,13
	rorx	r13d,ebx,2
	lea	r8d,[rax*1+r8]
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((112-128))+rdi]
	xor	r14d,r12d
	xor	r15d,ecx
	xor	r14d,r13d
	lea	eax,[r15*1+rax]
	mov	r12d,r9d
	lea	rbp,[((-64))+rbp]
	add	r11d,DWORD[((0+16))+rbp]
	and	r12d,r8d
	rorx	r13d,r8d,25
	rorx	r15d,r8d,11
	lea	eax,[r14*1+rax]
	lea	r11d,[r12*1+r11]
	andn	r12d,r8d,r10d
	xor	r13d,r15d
	rorx	r14d,r8d,6
	lea	r11d,[r12*1+r11]
	xor	r13d,r14d
	mov	r15d,eax
	rorx	r12d,eax,22
	lea	r11d,[r13*1+r11]
	xor	r15d,ebx
	rorx	r14d,eax,13
	rorx	r13d,eax,2
	lea	edx,[r11*1+rdx]
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((128-128))+rdi]
	xor	r14d,r12d
	xor	esi,ebx
	xor	r14d,r13d
	lea	r11d,[rsi*1+r11]
	mov	r12d,r8d
	add	r10d,DWORD[((4+16))+rbp]
	and	r12d,edx
	rorx	r13d,edx,25
	rorx	esi,edx,11
	lea	r11d,[r14*1+r11]
	lea	r10d,[r12*1+r10]
	andn	r12d,edx,r9d
	xor	r13d,esi
	rorx	r14d,edx,6
	lea	r10d,[r12*1+r10]
	xor	r13d,r14d
	mov	esi,r11d
	rorx	r12d,r11d,22
	lea	r10d,[r13*1+r10]
	xor	esi,eax
	rorx	r14d,r11d,13
	rorx	r13d,r11d,2
	lea	ecx,[r10*1+rcx]
	and	r15d,esi
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((144-128))+rdi]
	xor	r14d,r12d
	xor	r15d,eax
	xor	r14d,r13d
	lea	r10d,[r15*1+r10]
	mov	r12d,edx
	add	r9d,DWORD[((8+16))+rbp]
	and	r12d,ecx
	rorx	r13d,ecx,25
	rorx	r15d,ecx,11
	lea	r10d,[r14*1+r10]
	lea	r9d,[r12*1+r9]
	andn	r12d,ecx,r8d
	xor	r13d,r15d
	rorx	r14d,ecx,6
	lea	r9d,[r12*1+r9]
	xor	r13d,r14d
	mov	r15d,r10d
	rorx	r12d,r10d,22
	lea	r9d,[r13*1+r9]
	xor	r15d,r11d
	rorx	r14d,r10d,13
	rorx	r13d,r10d,2
	lea	ebx,[r9*1+rbx]
	and	esi,r15d
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((160-128))+rdi]
	xor	r14d,r12d
	xor	esi,r11d
	xor	r14d,r13d
	lea	r9d,[rsi*1+r9]
	mov	r12d,ecx
	add	r8d,DWORD[((12+16))+rbp]
	and	r12d,ebx
	rorx	r13d,ebx,25
	rorx	esi,ebx,11
	lea	r9d,[r14*1+r9]
	lea	r8d,[r12*1+r8]
	andn	r12d,ebx,edx
	xor	r13d,esi
	rorx	r14d,ebx,6
	lea	r8d,[r12*1+r8]
	xor	r13d,r14d
	mov	esi,r9d
	rorx	r12d,r9d,22
	lea	r8d,[r13*1+r8]
	xor	esi,r10d
	rorx	r14d,r9d,13
	rorx	r13d,r9d,2
	lea	eax,[r8*1+rax]
	and	r15d,esi
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((176-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r10d
	xor	r14d,r13d
	lea	r8d,[r15*1+r8]
	mov	r12d,ebx
	add	edx,DWORD[((32+16))+rbp]
	and	r12d,eax
	rorx	r13d,eax,25
	rorx	r15d,eax,11
	lea	r8d,[r14*1+r8]
	lea	edx,[r12*1+rdx]
	andn	r12d,eax,ecx
	xor	r13d,r15d
	rorx	r14d,eax,6
	lea	edx,[r12*1+rdx]
	xor	r13d,r14d
	mov	r15d,r8d
	rorx	r12d,r8d,22
	lea	edx,[r13*1+rdx]
	xor	r15d,r9d
	rorx	r14d,r8d,13
	rorx	r13d,r8d,2
	lea	r11d,[rdx*1+r11]
	and	esi,r15d
	vpand	xmm8,xmm11,xmm12
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((192-128))+rdi]
	xor	r14d,r12d
	xor	esi,r9d
	xor	r14d,r13d
	lea	edx,[rsi*1+rdx]
	mov	r12d,eax
	add	ecx,DWORD[((36+16))+rbp]
	and	r12d,r11d
	rorx	r13d,r11d,25
	rorx	esi,r11d,11
	lea	edx,[r14*1+rdx]
	lea	ecx,[r12*1+rcx]
	andn	r12d,r11d,ebx
	xor	r13d,esi
	rorx	r14d,r11d,6
	lea	ecx,[r12*1+rcx]
	xor	r13d,r14d
	mov	esi,edx
	rorx	r12d,edx,22
	lea	ecx,[r13*1+rcx]
	xor	esi,r8d
	rorx	r14d,edx,13
	rorx	r13d,edx,2
	lea	r10d,[rcx*1+r10]
	and	r15d,esi
	vaesenclast	xmm11,xmm9,xmm10
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((208-128))+rdi]
	xor	r14d,r12d
	xor	r15d,r8d
	xor	r14d,r13d
	lea	ecx,[r15*1+rcx]
	mov	r12d,r11d
	add	ebx,DWORD[((40+16))+rbp]
	and	r12d,r10d
	rorx	r13d,r10d,25
	rorx	r15d,r10d,11
	lea	ecx,[r14*1+rcx]
	lea	ebx,[r12*1+rbx]
	andn	r12d,r10d,eax
	xor	r13d,r15d
	rorx	r14d,r10d,6
	lea	ebx,[r12*1+rbx]
	xor	r13d,r14d
	mov	r15d,ecx
	rorx	r12d,ecx,22
	lea	ebx,[r13*1+rbx]
	xor	r15d,edx
	rorx	r14d,ecx,13
	rorx	r13d,ecx,2
	lea	r9d,[rbx*1+r9]
	and	esi,r15d
	vpand	xmm11,xmm11,xmm13
	vaesenc	xmm9,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((224-128))+rdi]
	xor	r14d,r12d
	xor	esi,edx
	xor	r14d,r13d
	lea	ebx,[rsi*1+rbx]
	mov	r12d,r10d
	add	eax,DWORD[((44+16))+rbp]
	and	r12d,r9d
	rorx	r13d,r9d,25
	rorx	esi,r9d,11
	lea	ebx,[r14*1+rbx]
	lea	eax,[r12*1+rax]
	andn	r12d,r9d,r11d
	xor	r13d,esi
	rorx	r14d,r9d,6
	lea	eax,[r12*1+rax]
	xor	r13d,r14d
	mov	esi,ebx
	rorx	r12d,ebx,22
	lea	eax,[r13*1+rax]
	xor	esi,ecx
	rorx	r14d,ebx,13
	rorx	r13d,ebx,2
	lea	r8d,[rax*1+r8]
	and	r15d,esi
	vpor	xmm8,xmm8,xmm11
	vaesenclast	xmm11,xmm9,xmm10
	vmovdqu	xmm10,XMMWORD[((0-128))+rdi]
	xor	r14d,r12d
	xor	r15d,ecx
	xor	r14d,r13d
	lea	eax,[r15*1+rax]
	mov	r12d,r9d
	vmovq	r13,xmm15
	vpextrq	r15,xmm15,1
	vpand	xmm11,xmm11,xmm14
	vpor	xmm8,xmm8,xmm11
	lea	rbp,[((-64))+rbp]
	vmovdqu	XMMWORD[r13*1+r15],xmm8
	lea	r13,[16+r13]
	cmp	rbp,rsp
	jae	NEAR $L$ower_avx2

	mov	r15,QWORD[552+rsp]
	lea	r13,[64+r13]
	mov	rsi,QWORD[560+rsp]
	add	eax,r14d
	lea	rsp,[448+rsp]

	add	eax,DWORD[r15]
	add	ebx,DWORD[4+r15]
	add	ecx,DWORD[8+r15]
	add	edx,DWORD[12+r15]
	add	r8d,DWORD[16+r15]
	add	r9d,DWORD[20+r15]
	add	r10d,DWORD[24+r15]
	lea	r12,[r13*1+rsi]
	add	r11d,DWORD[28+r15]

	cmp	r13,QWORD[((64+16))+rsp]

	mov	DWORD[r15],eax
	cmove	r12,rsp
	mov	DWORD[4+r15],ebx
	mov	DWORD[8+r15],ecx
	mov	DWORD[12+r15],edx
	mov	DWORD[16+r15],r8d
	mov	DWORD[20+r15],r9d
	mov	DWORD[24+r15],r10d
	mov	DWORD[28+r15],r11d

	jbe	NEAR $L$oop_avx2
	lea	rbp,[rsp]

$L$done_avx2:
	lea	rsp,[rbp]
	mov	r8,QWORD[((64+32))+rsp]
	mov	rsi,QWORD[((64+56))+rsp]
	vmovdqu	XMMWORD[r8],xmm8
	vzeroall
	movaps	xmm6,XMMWORD[128+rsp]
	movaps	xmm7,XMMWORD[144+rsp]
	movaps	xmm8,XMMWORD[160+rsp]
	movaps	xmm9,XMMWORD[176+rsp]
	movaps	xmm10,XMMWORD[192+rsp]
	movaps	xmm11,XMMWORD[208+rsp]
	movaps	xmm12,XMMWORD[224+rsp]
	movaps	xmm13,XMMWORD[240+rsp]
	movaps	xmm14,XMMWORD[256+rsp]
	movaps	xmm15,XMMWORD[272+rsp]
	mov	r15,QWORD[rsi]
	mov	r14,QWORD[8+rsi]
	mov	r13,QWORD[16+rsi]
	mov	r12,QWORD[24+rsi]
	mov	rbp,QWORD[32+rsi]
	mov	rbx,QWORD[40+rsi]
	lea	rsp,[48+rsi]
$L$epilogue_avx2:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret
$L$SEH_end_aesni_cbc_sha256_enc_avx2:

ALIGN	32
;-----------------------------------------------------------------------
; aesni_cbc_sha256_enc_shaext
;
; Stitched AES-CBC encryption + SHA-256 compression using AES-NI and the
; SHA extensions.  Each loop iteration encrypts four 16-byte blocks and
; runs 64 SHA-256 rounds over one 64-byte block, with the two instruction
; streams interleaved.
;
; ABI:   Win64 on entry (rcx, rdx, r8, r9, then stack at 40/48/56+rsp);
;        the prologue remaps them to rdi, rsi, rdx, rcx, r8, r9, r10.
;
; Register roles after the prologue (as established by the code below;
; the C-level meaning of each pointer is inferred from use -- TODO confirm
; against the caller's prototype):
;   rdi   AES input pointer, read at 0/16/32/48, advanced by 64 per pass
;   rsi   output pointer, turned into (rsi - rdi) so stores use [rdi+rsi]
;   rdx   iteration count, decremented once per pass (must be non-zero on
;         entry: the loop body runs before the count is tested)
;   rcx   key schedule pointer, biased by +112 after the first loads
;   r8    16-byte chaining value: loaded into xmm6, written back at exit
;   r9    32 bytes of hash state: loaded into xmm1/xmm2, written back at exit
;   r10   SHA-256 input pointer, 64 bytes consumed per pass
;   r11d  DWORD[240+rcx] of the key schedule; selects 10/12/14 AES rounds
;         via the comparisons against 11 (<11, ==11, >11)
;   rax   K256+128 (constant table, addressed with -128 bias)
;   xmm0  Wt+Kt operand implicitly consumed by sha256rnds2
;   xmm3  pshufb mask loaded from K256+512 / scratch for palignr
;   xmm7  preserved copy of that mask
;   xmm4/xmm5  alternating AES round keys, xmm15 = round key 0
;   xmm8/xmm9  hash state saved at loop top, added back at loop bottom
;   xmm10-xmm13  message schedule words
;
; Byte-encoded instructions (emitted as DB so that older assemblers
; accept the file), decoded per the x86 instruction encoding:
;   DB 15,56,203,modrm             sha256rnds2 xmm,xmm   (implicit xmm0)
;   DB 69,15,56,204,modrm          sha256msg1  xmm,xmm   (REX.RB)
;   DB 69,15,56,205,modrm          sha256msg2  xmm,xmm   (REX.RB)
;   DB 102,[68,]15,56,0,modrm      pshufb      xmm,xmm
;   DB 102,[65,]15,58,15,modrm,imm palignr     xmm,xmm,imm
;
; Saves/restores xmm6-xmm15, rdi and rsi as required by Win64.
;-----------------------------------------------------------------------
aesni_cbc_sha256_enc_shaext:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aesni_cbc_sha256_enc_shaext:
	; Move the Win64 arguments into the registers the body uses.
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]


	mov	r10,QWORD[56+rsp]
	; Reserve 168 bytes and save the non-volatile xmm6-xmm15 there.
	; rax still holds the entry rsp, so (-8-160)+rax == new rsp.
	lea	rsp,[((-168))+rsp]
	movaps	XMMWORD[(-8-160)+rax],xmm6
	movaps	XMMWORD[(-8-144)+rax],xmm7
	movaps	XMMWORD[(-8-128)+rax],xmm8
	movaps	XMMWORD[(-8-112)+rax],xmm9
	movaps	XMMWORD[(-8-96)+rax],xmm10
	movaps	XMMWORD[(-8-80)+rax],xmm11
	movaps	XMMWORD[(-8-64)+rax],xmm12
	movaps	XMMWORD[(-8-48)+rax],xmm13
	movaps	XMMWORD[(-8-32)+rax],xmm14
	movaps	XMMWORD[(-8-16)+rax],xmm15
$L$prologue_shaext:
	; rax = K256+128; xmm1/xmm2 = hash state; xmm3 = 16 bytes at K256+512.
	lea	rax,[((K256+128))]
	movdqu	xmm1,XMMWORD[r9]
	movdqu	xmm2,XMMWORD[16+r9]
	movdqa	xmm3,XMMWORD[((512-128))+rax]

	mov	r11d,DWORD[240+rcx]
	; rsi becomes the output-minus-input delta used by [rdi*1+rsi] stores.
	sub	rsi,rdi
	movups	xmm15,XMMWORD[rcx]
	movups	xmm6,XMMWORD[r8]
	movups	xmm4,XMMWORD[16+rcx]
	lea	rcx,[112+rcx]

	; Rearrange the loaded state words into the lane order used by the
	; sha256rnds2 sequence below (undone by the mirror sequence at exit).
	pshufd	xmm0,xmm1,0x1b
	pshufd	xmm1,xmm1,0xb1
	pshufd	xmm2,xmm2,0x1b
	; Keep a copy of the pshufb mask: xmm3 is reused as scratch in the loop.
	movdqa	xmm7,xmm3
DB	102,15,58,15,202,8
	punpcklqdq	xmm2,xmm0

	jmp	NEAR $L$oop_shaext

ALIGN	16
$L$oop_shaext:
	; Load 64 bytes of hash input; the pshufb (DB 102,68,15,56,0,..)
	; instructions shuffle each 16-byte chunk through the xmm3 mask.
	movdqu	xmm10,XMMWORD[r10]
	movdqu	xmm11,XMMWORD[16+r10]
	movdqu	xmm12,XMMWORD[32+r10]
DB	102,68,15,56,0,211
	movdqu	xmm13,XMMWORD[48+r10]

	; Constants are fetched at a 32-byte stride (0,32,...,480) because
	; every 16-byte row of K256 is stored twice in the table.
	movdqa	xmm0,XMMWORD[((0-128))+rax]
	paddd	xmm0,xmm10
DB	102,68,15,56,0,219
	; Remember the incoming state for the final additions.
	movdqa	xmm9,xmm2
	movdqa	xmm8,xmm1
	; AES block 1: xmm6 = chaining value ^ input ^ round key 0.
	movups	xmm14,XMMWORD[rdi]
	xorps	xmm14,xmm15
	xorps	xmm6,xmm14
	movups	xmm5,XMMWORD[((-80))+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,209
	; Bring the upper two Wt+Kt dwords down for the next sha256rnds2.
	pshufd	xmm0,xmm0,0x0e
	movups	xmm4,XMMWORD[((-64))+rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,202

	movdqa	xmm0,XMMWORD[((32-128))+rax]
	paddd	xmm0,xmm11
DB	102,68,15,56,0,227
	lea	r10,[64+r10]
	movups	xmm5,XMMWORD[((-48))+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movups	xmm4,XMMWORD[((-32))+rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,202

	movdqa	xmm0,XMMWORD[((64-128))+rax]
	paddd	xmm0,xmm12
DB	102,68,15,56,0,235
DB	69,15,56,204,211
	movups	xmm5,XMMWORD[((-16))+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	; From here on xmm3 is scratch for the message-schedule palignr.
	movdqa	xmm3,xmm13
DB	102,65,15,58,15,220,4
	paddd	xmm10,xmm3
	movups	xmm4,XMMWORD[rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,202

	movdqa	xmm0,XMMWORD[((96-128))+rax]
	paddd	xmm0,xmm13
DB	69,15,56,205,213
DB	69,15,56,204,220
	movups	xmm5,XMMWORD[16+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movups	xmm4,XMMWORD[32+rcx]
	aesenc	xmm6,xmm5
	movdqa	xmm3,xmm10
DB	102,65,15,58,15,221,4
	paddd	xmm11,xmm3
DB	15,56,203,202
	movdqa	xmm0,XMMWORD[((128-128))+rax]
	paddd	xmm0,xmm10
DB	69,15,56,205,218
DB	69,15,56,204,229
	movups	xmm5,XMMWORD[48+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movdqa	xmm3,xmm11
DB	102,65,15,58,15,218,4
	paddd	xmm12,xmm3
	; Nine aesenc rounds done.  r11d < 11: finish now; r11d == 11: two
	; more rounds; otherwise four more.  The flags from this cmp are
	; still live at the je below (movups/aesenc do not modify flags).
	cmp	r11d,11
	jb	NEAR $L$aesenclast1
	movups	xmm4,XMMWORD[64+rcx]
	aesenc	xmm6,xmm5
	movups	xmm5,XMMWORD[80+rcx]
	aesenc	xmm6,xmm4
	je	NEAR $L$aesenclast1
	movups	xmm4,XMMWORD[96+rcx]
	aesenc	xmm6,xmm5
	movups	xmm5,XMMWORD[112+rcx]
	aesenc	xmm6,xmm4
$L$aesenclast1:
	aesenclast	xmm6,xmm5
	; Reload round key 1 (rcx is biased by +112) for the next block.
	movups	xmm4,XMMWORD[((16-112))+rcx]
	nop
DB	15,56,203,202
	; AES block 2: store ciphertext 1, then chain it into the next input.
	movups	xmm14,XMMWORD[16+rdi]
	xorps	xmm14,xmm15
	movups	XMMWORD[rdi*1+rsi],xmm6
	xorps	xmm6,xmm14
	movups	xmm5,XMMWORD[((-80))+rcx]
	aesenc	xmm6,xmm4
	movdqa	xmm0,XMMWORD[((160-128))+rax]
	paddd	xmm0,xmm11
DB	69,15,56,205,227
DB	69,15,56,204,234
	movups	xmm4,XMMWORD[((-64))+rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movdqa	xmm3,xmm12
DB	102,65,15,58,15,219,4
	paddd	xmm13,xmm3
	movups	xmm5,XMMWORD[((-48))+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,202
	movdqa	xmm0,XMMWORD[((192-128))+rax]
	paddd	xmm0,xmm12
DB	69,15,56,205,236
DB	69,15,56,204,211
	movups	xmm4,XMMWORD[((-32))+rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movdqa	xmm3,xmm13
DB	102,65,15,58,15,220,4
	paddd	xmm10,xmm3
	movups	xmm5,XMMWORD[((-16))+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,202
	movdqa	xmm0,XMMWORD[((224-128))+rax]
	paddd	xmm0,xmm13
DB	69,15,56,205,213
DB	69,15,56,204,220
	movups	xmm4,XMMWORD[rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movdqa	xmm3,xmm10
DB	102,65,15,58,15,221,4
	paddd	xmm11,xmm3
	movups	xmm5,XMMWORD[16+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,202
	movdqa	xmm0,XMMWORD[((256-128))+rax]
	paddd	xmm0,xmm10
DB	69,15,56,205,218
DB	69,15,56,204,229
	movups	xmm4,XMMWORD[32+rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movdqa	xmm3,xmm11
DB	102,65,15,58,15,218,4
	paddd	xmm12,xmm3
	movups	xmm5,XMMWORD[48+rcx]
	aesenc	xmm6,xmm4
	; Same round-count dispatch as for block 1.
	cmp	r11d,11
	jb	NEAR $L$aesenclast2
	movups	xmm4,XMMWORD[64+rcx]
	aesenc	xmm6,xmm5
	movups	xmm5,XMMWORD[80+rcx]
	aesenc	xmm6,xmm4
	je	NEAR $L$aesenclast2
	movups	xmm4,XMMWORD[96+rcx]
	aesenc	xmm6,xmm5
	movups	xmm5,XMMWORD[112+rcx]
	aesenc	xmm6,xmm4
$L$aesenclast2:
	aesenclast	xmm6,xmm5
	movups	xmm4,XMMWORD[((16-112))+rcx]
	nop
DB	15,56,203,202
	; AES block 3: store ciphertext 2, chain into the third input block.
	movups	xmm14,XMMWORD[32+rdi]
	xorps	xmm14,xmm15
	movups	XMMWORD[16+rdi*1+rsi],xmm6
	xorps	xmm6,xmm14
	movups	xmm5,XMMWORD[((-80))+rcx]
	aesenc	xmm6,xmm4
	movdqa	xmm0,XMMWORD[((288-128))+rax]
	paddd	xmm0,xmm11
DB	69,15,56,205,227
DB	69,15,56,204,234
	movups	xmm4,XMMWORD[((-64))+rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movdqa	xmm3,xmm12
DB	102,65,15,58,15,219,4
	paddd	xmm13,xmm3
	movups	xmm5,XMMWORD[((-48))+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,202
	movdqa	xmm0,XMMWORD[((320-128))+rax]
	paddd	xmm0,xmm12
DB	69,15,56,205,236
DB	69,15,56,204,211
	movups	xmm4,XMMWORD[((-32))+rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movdqa	xmm3,xmm13
DB	102,65,15,58,15,220,4
	paddd	xmm10,xmm3
	movups	xmm5,XMMWORD[((-16))+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,202
	movdqa	xmm0,XMMWORD[((352-128))+rax]
	paddd	xmm0,xmm13
DB	69,15,56,205,213
DB	69,15,56,204,220
	movups	xmm4,XMMWORD[rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movdqa	xmm3,xmm10
DB	102,65,15,58,15,221,4
	paddd	xmm11,xmm3
	movups	xmm5,XMMWORD[16+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,202
	movdqa	xmm0,XMMWORD[((384-128))+rax]
	paddd	xmm0,xmm10
DB	69,15,56,205,218
DB	69,15,56,204,229
	movups	xmm4,XMMWORD[32+rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movdqa	xmm3,xmm11
DB	102,65,15,58,15,218,4
	paddd	xmm12,xmm3
	movups	xmm5,XMMWORD[48+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,202
	movdqa	xmm0,XMMWORD[((416-128))+rax]
	paddd	xmm0,xmm11
DB	69,15,56,205,227
DB	69,15,56,204,234
	; Same round-count dispatch as for block 1.
	cmp	r11d,11
	jb	NEAR $L$aesenclast3
	movups	xmm4,XMMWORD[64+rcx]
	aesenc	xmm6,xmm5
	movups	xmm5,XMMWORD[80+rcx]
	aesenc	xmm6,xmm4
	je	NEAR $L$aesenclast3
	movups	xmm4,XMMWORD[96+rcx]
	aesenc	xmm6,xmm5
	movups	xmm5,XMMWORD[112+rcx]
	aesenc	xmm6,xmm4
$L$aesenclast3:
	aesenclast	xmm6,xmm5
	movups	xmm4,XMMWORD[((16-112))+rcx]
	nop
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movdqa	xmm3,xmm12
DB	102,65,15,58,15,219,4
	paddd	xmm13,xmm3
	; AES block 4: store ciphertext 3, chain into the fourth input block.
	movups	xmm14,XMMWORD[48+rdi]
	xorps	xmm14,xmm15
	movups	XMMWORD[32+rdi*1+rsi],xmm6
	xorps	xmm6,xmm14
	movups	xmm5,XMMWORD[((-80))+rcx]
	aesenc	xmm6,xmm4
	movups	xmm4,XMMWORD[((-64))+rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,202

	movdqa	xmm0,XMMWORD[((448-128))+rax]
	paddd	xmm0,xmm12
DB	69,15,56,205,236
	; Message schedule is complete: put the pshufb mask back into xmm3
	; for the next iteration's input shuffles.
	movdqa	xmm3,xmm7
	movups	xmm5,XMMWORD[((-48))+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movups	xmm4,XMMWORD[((-32))+rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,202

	movdqa	xmm0,XMMWORD[((480-128))+rax]
	paddd	xmm0,xmm13
	movups	xmm5,XMMWORD[((-16))+rcx]
	aesenc	xmm6,xmm4
	movups	xmm4,XMMWORD[rcx]
	aesenc	xmm6,xmm5
DB	15,56,203,209
	pshufd	xmm0,xmm0,0x0e
	movups	xmm5,XMMWORD[16+rcx]
	aesenc	xmm6,xmm4
DB	15,56,203,202

	movups	xmm4,XMMWORD[32+rcx]
	aesenc	xmm6,xmm5
	movups	xmm5,XMMWORD[48+rcx]
	aesenc	xmm6,xmm4
	; Same round-count dispatch as for block 1.
	cmp	r11d,11
	jb	NEAR $L$aesenclast4
	movups	xmm4,XMMWORD[64+rcx]
	aesenc	xmm6,xmm5
	movups	xmm5,XMMWORD[80+rcx]
	aesenc	xmm6,xmm4
	je	NEAR $L$aesenclast4
	movups	xmm4,XMMWORD[96+rcx]
	aesenc	xmm6,xmm5
	movups	xmm5,XMMWORD[112+rcx]
	aesenc	xmm6,xmm4
$L$aesenclast4:
	aesenclast	xmm6,xmm5
	movups	xmm4,XMMWORD[((16-112))+rcx]
	nop

	; Add the state saved at the top of the loop to the round output.
	paddd	xmm2,xmm9
	paddd	xmm1,xmm8

	; ZF from this dec is consumed by the jnz below; the movups and lea
	; in between do not write flags.
	dec	rdx
	movups	XMMWORD[48+rdi*1+rsi],xmm6
	lea	rdi,[64+rdi]
	jnz	NEAR $L$oop_shaext

	; Mirror of the entry shuffle: put the state words back into the
	; memory order they were loaded in.
	pshufd	xmm2,xmm2,0xb1
	pshufd	xmm3,xmm1,0x1b
	pshufd	xmm1,xmm1,0xb1
	punpckhqdq	xmm1,xmm2
DB	102,15,58,15,211,8

	; Write back the last ciphertext block (next chaining value) and the
	; updated hash state, then restore the non-volatile xmm registers.
	movups	XMMWORD[r8],xmm6
	movdqu	XMMWORD[r9],xmm1
	movdqu	XMMWORD[16+r9],xmm2
	movaps	xmm6,XMMWORD[rsp]
	movaps	xmm7,XMMWORD[16+rsp]
	movaps	xmm8,XMMWORD[32+rsp]
	movaps	xmm9,XMMWORD[48+rsp]
	movaps	xmm10,XMMWORD[64+rsp]
	movaps	xmm11,XMMWORD[80+rsp]
	movaps	xmm12,XMMWORD[96+rsp]
	movaps	xmm13,XMMWORD[112+rsp]
	movaps	xmm14,XMMWORD[128+rsp]
	movaps	xmm15,XMMWORD[144+rsp]
	lea	rsp,[((8+160))+rsp]
$L$epilogue_shaext:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret
$L$SEH_end_aesni_cbc_sha256_enc_shaext:
EXTERN	__imp_RtlVirtualUnwind

ALIGN	16
;-----------------------------------------------------------------------
; se_handler -- Win64 language-specific exception handler shared by the
; aesni_cbc_sha256_enc_* routines in this file.
;
; In (Win64 handler convention):
;   rcx = EXCEPTION_RECORD*   (unused here)
;   rdx = establisher frame   (unused here)
;   r8  = CONTEXT*            (offsets: Rax 120, Rbx 144, Rsp 152, Rbp 160,
;                              Rsi 168, Rdi 176, R12-R15 216-240, Rip 248,
;                              Xmm6 512)
;   r9  = DISPATCHER_CONTEXT* (offsets: ImageBase 8, ContextRecord 40,
;                              HandlerData 56)
; HandlerData points at two image-relative DWORDs: the end-of-prologue and
; start-of-epilogue labels of the faulting routine (see the .xdata table).
;
; When the fault lies between those two labels, the routine's saved
; non-volatile registers are copied from its stack frame into the CONTEXT;
; rsi/rdi and Rsp are then fixed up, the CONTEXT is copied to
; disp->ContextRecord and RtlVirtualUnwind is called.
; Out: eax = 1 (ExceptionContinueSearch).
;-----------------------------------------------------------------------
se_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64			; home space + 4 stack args for the call

	mov	rbx,QWORD[248+r8]	; rbx = context->Rip
	mov	rax,QWORD[120+r8]	; rax = context->Rax (entry rsp copy)

	mov	r11,QWORD[56+r9]	; r11 = disp->HandlerData
	mov	rsi,QWORD[8+r9]		; rsi = disp->ImageBase

	; Rip below the end-of-prologue label: frame not set up yet.
	mov	r10d,DWORD[r11]
	lea	r10,[rsi+r10]
	cmp	rbx,r10
	jb	NEAR $L$in_prologue

	mov	rax,QWORD[152+r8]	; rax = context->Rsp

	; Rip at/after the epilogue label: registers already restored.
	mov	r10d,DWORD[4+r11]
	lea	r10,[rsi+r10]
	cmp	rbx,r10
	jae	NEAR $L$in_prologue

	lea	r10,[aesni_cbc_sha256_enc_shaext]
	cmp	rbx,r10
	jb	NEAR $L$not_in_shaext

	; shaext frame: xmm6-xmm15 sit at rsp+0, entry rsp is rsp+168.
	mov	rsi,rax
	lea	rdi,[512+r8]		; &context->Xmm6
	mov	ecx,20			; 10 xmm registers = 20 qwords
	cld
	rep movsq
	lea	rax,[168+rax]
	jmp	NEAR $L$in_prologue

$L$not_in_shaext:
	lea	r10,[$L$avx2_shortcut]
	cmp	rbx,r10
	jb	NEAR $L$not_in_avx2

	; AVX2 body moves rsp inside a 1024-byte aligned area; rebuild the
	; frame base from it.
	and	rax,-1024
	add	rax,448

$L$not_in_avx2:
	mov	rsi,rax			; rsi = frame base
	mov	rax,QWORD[120+rax]	; saved stack pointer stored at 64+56
	lea	rax,[48+rax]		; step over the six pushed registers

	; Fetch the pushed non-volatile GPRs, then publish them to the CONTEXT.
	mov	r15,QWORD[((-48))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	rbp,QWORD[((-16))+rax]
	mov	rbx,QWORD[((-8))+rax]
	mov	QWORD[240+r8],r15
	mov	QWORD[232+r8],r14
	mov	QWORD[224+r8],r13
	mov	QWORD[216+r8],r12
	mov	QWORD[160+r8],rbp
	mov	QWORD[144+r8],rbx

	; xmm6-xmm15 were saved at frame base + 128.
	lea	rsi,[128+rsi]
	lea	rdi,[512+r8]		; &context->Xmm6
	mov	ecx,20
	cld
	rep movsq

$L$in_prologue:
	; rax = rsp as it was on entry to the faulting routine; the WIN64
	; prologue parked rdi/rsi in the caller's home space at 8/16.
	mov	rsi,QWORD[16+rax]
	mov	rdi,QWORD[8+rax]
	mov	QWORD[152+r8],rax	; context->Rsp
	mov	QWORD[176+r8],rdi	; context->Rdi
	mov	QWORD[168+r8],rsi	; context->Rsi

	; *disp->ContextRecord = *context  (154 qwords)
	mov	rdi,QWORD[40+r9]
	mov	rsi,r8
	mov	ecx,154
	cld
	rep movsq

	; RtlVirtualUnwind(0, ImageBase, ControlPc, FunctionEntry,
	;                  ContextRecord, &HandlerData, &EstablisherFrame, NULL)
	mov	rsi,r9
	mov	r8,QWORD[rsi]
	mov	rdx,QWORD[8+rsi]
	mov	r9,QWORD[16+rsi]
	mov	r10,QWORD[40+rsi]
	xor	ecx,ecx
	mov	QWORD[32+rsp],r10
	lea	r11,[56+rsi]
	mov	QWORD[40+rsp],r11
	lea	r12,[24+rsi]
	mov	QWORD[48+rsp],r12
	mov	QWORD[56+rsp],rcx
	call	QWORD[__imp_RtlVirtualUnwind]

	mov	eax,1			; ExceptionContinueSearch
	lea	rsp,[64+rsp]
	popfq
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx
	pop	rdi
	pop	rsi
	DB	0F3h,0C3h		;repret


; Win64 function table.  One three-DWORD entry per code path, all
; image-base relative: start label, end label, and the address of the
; matching $L$SEH_info_* record in .xdata.
section	.pdata rdata align=4
	; XOP code path
	DD	$L$SEH_begin_aesni_cbc_sha256_enc_xop wrt ..imagebase
	DD	$L$SEH_end_aesni_cbc_sha256_enc_xop wrt ..imagebase
	DD	$L$SEH_info_aesni_cbc_sha256_enc_xop wrt ..imagebase

	; AVX code path
	DD	$L$SEH_begin_aesni_cbc_sha256_enc_avx wrt ..imagebase
	DD	$L$SEH_end_aesni_cbc_sha256_enc_avx wrt ..imagebase
	DD	$L$SEH_info_aesni_cbc_sha256_enc_avx wrt ..imagebase
	; AVX2 code path
	DD	$L$SEH_begin_aesni_cbc_sha256_enc_avx2 wrt ..imagebase
	DD	$L$SEH_end_aesni_cbc_sha256_enc_avx2 wrt ..imagebase
	DD	$L$SEH_info_aesni_cbc_sha256_enc_avx2 wrt ..imagebase
	; SHA-extension code path
	DD	$L$SEH_begin_aesni_cbc_sha256_enc_shaext wrt ..imagebase
	DD	$L$SEH_end_aesni_cbc_sha256_enc_shaext wrt ..imagebase
	DD	$L$SEH_info_aesni_cbc_sha256_enc_shaext wrt ..imagebase
; Unwind-info records referenced from .pdata.  Each record is:
;   DB 9,0,0,0  header: first byte 9 = version 1 with the exception-handler
;               flag set (1 | 1<<3); prologue size, unwind-code count and
;               frame register are all zero
;   DD          image-relative address of the handler (se_handler)
;   DD,DD       handler data: image-relative end-of-prologue and
;               start-of-epilogue labels, which se_handler reads through
;               the HandlerData pointer to decide whether the frame is live
section	.xdata rdata align=8
ALIGN	8
$L$SEH_info_aesni_cbc_sha256_enc_xop:
DB	9,0,0,0
	DD	se_handler wrt ..imagebase
	DD	$L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase

$L$SEH_info_aesni_cbc_sha256_enc_avx:
DB	9,0,0,0
	DD	se_handler wrt ..imagebase
	DD	$L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
$L$SEH_info_aesni_cbc_sha256_enc_avx2:
DB	9,0,0,0
	DD	se_handler wrt ..imagebase
	DD	$L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase
$L$SEH_info_aesni_cbc_sha256_enc_shaext:
DB	9,0,0,0
	DD	se_handler wrt ..imagebase
	DD	$L$prologue_shaext wrt ..imagebase,$L$epilogue_shaext wrt ..imagebase
